Commit b354c96 (parent: 3b14ed8)
replace static plots with dynamic d3 embeds
Files changed:
- app/src/content/assets/image/newplot_2c31384e-bcac-800b-82e8-ff44228f7720.png +0 -3
- app/src/content/assets/image/newplot_2c41384e-bcac-8073-9395-cf2d0e901187.png +0 -3
- app/src/content/assets/image/newplot_2d21384e-bcac-80ab-a6dd-e31a6c150e61.png +0 -3
- app/src/content/assets/image/newplot_2d61384e-bcac-8092-baca-c17346b95734.png +0 -3
- app/src/content/assets/image/newplot_2da1384e-bcac-80d6-a8b9-da80324f8fef.png +0 -3
- app/src/content/assets/image/newplot_2df1384e-bcac-8010-abe7-cf477262b8d6.png +0 -3
- app/src/content/assets/image/newplot_2df1384e-bcac-8018-b1f6-da1dcde1f90a.png +0 -3
- app/src/content/assets/image/newplot_2df1384e-bcac-80bc-b93c-ee8e9cfd5529.png +0 -3
- app/src/content/assets/image/newplot_2e01384e-bcac-8017-9829-cd0c1db928c6.png +0 -3
- app/src/content/assets/image/newplot_2e01384e-bcac-806f-8bf1-f7e5405a2ff9.png +0 -3
- app/src/content/assets/image/newplot_2e11384e-bcac-800a-abc6-d0690da3f955.png +0 -3
- app/src/content/assets/image/newplot_2e11384e-bcac-8032-9835-e1407f4d780d.png +0 -3
- app/src/content/assets/image/newplot_2e11384e-bcac-80a3-a6fa-e8634e0e2206.png +0 -3
- app/src/content/assets/image/newplot_2e11384e-bcac-80bc-810d-d13554c628dc.png +0 -3
- app/src/content/assets/image/newplot_2e11384e-bcac-80dd-972d-cf77d9c3b004.png +0 -3
- app/src/content/assets/image/newplot_2e11384e-bcac-80ea-88cc-c971b2816596.png +0 -3
- app/src/content/assets/image/newplot_2e21384e-bcac-80a2-9bac-c543304d926e.png +0 -3
- app/src/content/assets/image/newplot_2e41384e-bcac-8065-b313-c38a6db4ac31.png +0 -3
- app/src/content/assets/image/newplot_2e41384e-bcac-80c0-aef5-e71fdbaccd8d.png +0 -3
- app/src/content/assets/image/newplot_2e71384e-bcac-8027-ae32-c133627ede4a.png +0 -3
- app/src/content/assets/image/newplot_2ee1384e-bcac-80da-82cd-df97247e2e72.png +0 -3
- app/src/content/assets/image/newplot_2f61384e-bcac-80d9-ab81-d57a228847cf.png +0 -3
- app/src/content/assets/image/newplot_2f71384e-bcac-80c6-a99e-f52084fc497b.png +0 -3
- app/src/content/assets/image/newplot_2f71384e-bcac-80d8-9985-e195d39f1e70.png +0 -3
- app/src/content/chapters/experiments.mdx +408 -49
(All 24 newplot_*.png assets listed above were deleted; each was a Git LFS pointer file.)
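Each deleted image shows `+0 -3` because the repository stores only a Git LFS pointer, a three-line text file of this form (the oid and size below are illustrative, not the actual values):

```
version https://git-lfs.github.com/spec/v1
oid sha256:4d7a214614ab2935c943f9e0ff69d22eadbb8f32b1258daaa5e2ca24d17e2393
size 12345
```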
app/src/content/chapters/experiments.mdx (CHANGED)
@@ -1,29 +1,4 @@
-import Image from "../../components/Image.astro";
import HtmlEmbed from "../../components/HtmlEmbed.astro";
-import newplot_2c41384e_bcac_8073_9395_cf2d0e901187 from "../assets/image/newplot_2c41384e-bcac-8073-9395-cf2d0e901187.png";
-import newplot_2c31384e_bcac_800b_82e8_ff44228f7720 from "../assets/image/newplot_2c31384e-bcac-800b-82e8-ff44228f7720.png";
-import newplot_2e11384e_bcac_800a_abc6_d0690da3f955 from "../assets/image/newplot_2e11384e-bcac-800a-abc6-d0690da3f955.png";
-import newplot_2e21384e_bcac_80a2_9bac_c543304d926e from "../assets/image/newplot_2e21384e-bcac-80a2-9bac-c543304d926e.png";
-import newplot_2e11384e_bcac_80dd_972d_cf77d9c3b004 from "../assets/image/newplot_2e11384e-bcac-80dd-972d-cf77d9c3b004.png";
-import newplot_2e11384e_bcac_80a3_a6fa_e8634e0e2206 from "../assets/image/newplot_2e11384e-bcac-80a3-a6fa-e8634e0e2206.png";
-import newplot_2e41384e_bcac_80c0_aef5_e71fdbaccd8d from "../assets/image/newplot_2e41384e-bcac-80c0-aef5-e71fdbaccd8d.png";
-import newplot_2da1384e_bcac_80d6_a8b9_da80324f8fef from "../assets/image/newplot_2da1384e-bcac-80d6-a8b9-da80324f8fef.png";
-import newplot_2e71384e_bcac_8027_ae32_c133627ede4a from "../assets/image/newplot_2e71384e-bcac-8027-ae32-c133627ede4a.png";
-import newplot_2f71384e_bcac_80c6_a99e_f52084fc497b from "../assets/image/newplot_2f71384e-bcac-80c6-a99e-f52084fc497b.png";
-import newplot_2f71384e_bcac_80d8_9985_e195d39f1e70 from "../assets/image/newplot_2f71384e-bcac-80d8-9985-e195d39f1e70.png";
-import newplot_2d21384e_bcac_80ab_a6dd_e31a6c150e61 from "../assets/image/newplot_2d21384e-bcac-80ab-a6dd-e31a6c150e61.png";
-import newplot_2e11384e_bcac_80ea_88cc_c971b2816596 from "../assets/image/newplot_2e11384e-bcac-80ea-88cc-c971b2816596.png";
-import newplot_2e11384e_bcac_8032_9835_e1407f4d780d from "../assets/image/newplot_2e11384e-bcac-8032-9835-e1407f4d780d.png";
-import newplot_2df1384e_bcac_80bc_b93c_ee8e9cfd5529 from "../assets/image/newplot_2df1384e-bcac-80bc-b93c-ee8e9cfd5529.png";
-import newplot_2df1384e_bcac_8018_b1f6_da1dcde1f90a from "../assets/image/newplot_2df1384e-bcac-8018-b1f6-da1dcde1f90a.png";
-import newplot_2e01384e_bcac_8017_9829_cd0c1db928c6 from "../assets/image/newplot_2e01384e-bcac-8017-9829-cd0c1db928c6.png";
-import newplot_2e01384e_bcac_806f_8bf1_f7e5405a2ff9 from "../assets/image/newplot_2e01384e-bcac-806f-8bf1-f7e5405a2ff9.png";
-import newplot_2d61384e_bcac_8092_baca_c17346b95734 from "../assets/image/newplot_2d61384e-bcac-8092-baca-c17346b95734.png";
-import newplot_2e41384e_bcac_8065_b313_c38a6db4ac31 from "../assets/image/newplot_2e41384e-bcac-8065-b313-c38a6db4ac31.png";
-import newplot_2df1384e_bcac_8010_abe7_cf477262b8d6 from "../assets/image/newplot_2df1384e-bcac-8010-abe7-cf477262b8d6.png";
-import newplot_2e11384e_bcac_80bc_810d_d13554c628dc from "../assets/image/newplot_2e11384e-bcac-80bc-810d-d13554c628dc.png";
-import newplot_2f61384e_bcac_80d9_ab81_d57a228847cf from "../assets/image/newplot_2f61384e-bcac-80d9-ab81-d57a228847cf.png";
-import newplot_2ee1384e_bcac_80da_82cd_df97247e2e72 from "../assets/image/newplot_2ee1384e-bcac-80da-82cd-df97247e2e72.png";

## Experiments

@@ -99,7 +74,26 @@ DCLM, REWIRE and Nemotron-HQ-Synth are the strongest baselines in our setup by a

Using gemma-3-1b, the prompt from REWIRE (guided_rewrite_original) is on par with DCLM in our setup. Nemotron-HQ-Synth was created using five prompts: diverse_qa_pairs, extract_knowledge, distill, wikipedia_style_rephrasing and knowledge_list. The only prompt that really works well in our setup is diverse_qa_pairs. This is mainly due to very strong performance on SQuAD. We used fineweb-edu-hq as the source dataset for all prompts.

-<Image ... />
+<HtmlEmbed
+  id="dissecting-baselines"
+  src="d3-benchmark-comparison.html"
+  title="Dissecting Synthetic Baselines"
+  desc="Figure: Individual prompt performance from existing synthetic datasets compared to DCLM and FineWeb-Edu (HQ)."
+  config={{
+    defaultView: "line",
+    datasetNames: {
+      "mix-fw_edu_hq-diverse_qa_pairs_1b_hq": "Diverse QA Pairs",
+      dclm: "DCLM",
+      "mix-fw_edu_hq-extract_knowledge_1b_hq": "Extract Knowledge",
+      "mix-fw_edu_hq-guided_rewrite_original_1b_hq": "Guided Rewrite (REWIRE)",
+      nemotron_hq_synth: "Nemotron-HQ-Synth",
+      "mix-fw_edu_hq-distill_1b_hq": "Distill",
+      "mix-fw_edu_hq-wikipedia_style_rephrasing_1b_hq": "Wikipedia Rephrasing",
+      "mix-fw_edu_hq-knowledge_list_1b_hq": "Knowledge List",
+      fw_edu_hq: "FineWeb-Edu (HQ)"
+    }
+  }}
+/>

We see that dclm is a very strong baseline: apart from the diverse_qa_pairs prompt from the Nemotron-HQ-Synth dataset, no other open prior work outperforms dclm. Can we do better with different prompts?

@@ -107,7 +101,23 @@ We see that dclm is a very strong baseline: apart from the diverse_qa_pairs prom

We found four prompts that outperform both fw_edu_hq and the challenging dclm baseline: math, table, faq and tutorial.

-<Image ... />
+<HtmlEmbed
+  id="new-prompts"
+  src="d3-benchmark-comparison.html"
+  title="New Prompt Performance"
+  desc="Figure: Four new prompts (math, table, faq, tutorial) compared against DCLM and FineWeb-Edu (HQ)."
+  config={{
+    defaultView: "line",
+    datasetNames: {
+      "mix-fw_edu_hq-math_1b_hq": "Math",
+      "mix-fw_edu_hq-table_1b_hq": "Table",
+      "mix-fw_edu_hq-faq_1b_hq": "FAQ",
+      "mix-fw_edu_hq-tutorial_1b_hq": "Tutorial",
+      dclm: "DCLM",
+      fw_edu_hq: "FineWeb-Edu (HQ)"
+    }
+  }}
+/>

For now we just used the Gemma-3-1b model, but can we do better by changing the rephrasing model?

@@ -119,11 +129,45 @@ In general, we want to know whether using a stronger model leads to better synth

We compare rephrasing with all Gemma-3 sizes (270m, 1b, 4b, 12b, 27b) using the tutorial prompt. We find that the 270m model underperforms, but otherwise there is no significant difference.

-<Image ... />
+<HtmlEmbed
+  id="model-size-tutorial"
+  src="d3-benchmark-comparison.html"
+  title="Model Size: Tutorial Prompt"
+  desc="Figure: Gemma-3 model sizes (270M to 27B) on the tutorial prompt."
+  config={{
+    defaultView: "line",
+    datasetNames: {
+      "mix-fw_edu_hq-tutorial_27b_hq": "Gemma-3 27B",
+      "mix-fw_edu_hq-tutorial_12b_hq": "Gemma-3 12B",
+      "mix-fw_edu_hq-tutorial_4b_hq": "Gemma-3 4B",
+      "mix-fw_edu_hq-tutorial_1b_hq": "Gemma-3 1B",
+      "mix-fw_edu_hq-tutorial_270m_hq": "Gemma-3 270M",
+      dclm: "DCLM",
+      fw_edu_hq: "FineWeb-Edu (HQ)"
+    }
+  }}
+/>

Potentially, writing a tutorial is easy enough that we only need larger models for harder prompts such as math. So we tested it there too, but we find similar results, with the 270m underperforming and no large difference between 1b, 4b, 12b and 27b.

-<Image ... />
+<HtmlEmbed
+  id="model-size-math"
+  src="d3-benchmark-comparison.html"
+  title="Model Size: Math Prompt"
+  desc="Figure: Gemma-3 model sizes (270M to 27B) on the math prompt."
+  config={{
+    defaultView: "line",
+    datasetNames: {
+      "mix-fw_edu_hq-math_1b_hq": "Gemma-3 1B",
+      "mix-fw_edu_hq-math_4b_hq": "Gemma-3 4B",
+      "mix-fw_edu_hq-math_27b_hq": "Gemma-3 27B",
+      "mix-fw_edu_hq-math_12b_hq": "Gemma-3 12B",
+      "mix-fw_edu_hq-math_270m_hq": "Gemma-3 270M",
+      dclm: "DCLM",
+      fw_edu_hq: "FineWeb-Edu (HQ)"
+    }
+  }}
+/>

TODO: also run this experiment for the REWIRE prompt, since the original authors claim that larger models are necessary there

@@ -131,34 +175,140 @@ TODO: also run this experiment for the REWIRE prompt since the original authors

The [REWIRE](https://arxiv.org/abs/2506.04689) paper claims that for upcycling low-quality data we need large models (Llama-3.3 70B in their case). Is this true?
Continue prompt: For the 1b model the source data does not seem to matter, but the 12b model can make better use of the hq data.

-<Image ... />
+<HtmlEmbed
+  id="size-quality-continue"
+  src="d3-benchmark-comparison.html"
+  title="Model Size vs Data Quality: Continue Prompt"
+  desc="Figure: 1B vs 12B model on HQ vs LQ data using the continue prompt."
+  config={{
+    defaultView: "line",
+    datasetNames: {
+      "mix-fw_edu_hq-continue_12b_hq": "12B, HQ Source",
+      "mix-fw_edu_hq-continue_1b_hq": "1B, HQ Source",
+      "mix-fw_edu_hq-continue_1b_lq": "1B, LQ Source",
+      "mix-fw_edu_hq-continue_12b_lq": "12B, LQ Source"
+    }
+  }}
+/>

Tutorial prompt: For the hq data the model size does not seem to matter, whereas for the lq data the larger model is slightly better.

-<Image ... />
+<HtmlEmbed
+  id="size-quality-tutorial"
+  src="d3-benchmark-comparison.html"
+  title="Model Size vs Data Quality: Tutorial Prompt"
+  desc="Figure: 1B vs 12B model on HQ vs LQ data using the tutorial prompt."
+  config={{
+    defaultView: "line",
+    datasetNames: {
+      "mix-fw_edu_hq-tutorial_1b_hq": "1B, HQ Source",
+      "mix-fw_edu_hq-tutorial_12b_hq": "12B, HQ Source",
+      "mix-fw_edu_hq-tutorial_12b_lq": "12B, LQ Source",
+      "mix-fw_edu_hq-tutorial_1b_lq": "1B, LQ Source"
+    }
+  }}
+/>

FAQ prompt: Surprisingly, the 1b model is better for both lq and hq data.

-<Image ... />
+<HtmlEmbed
+  id="size-quality-faq"
+  src="d3-benchmark-comparison.html"
+  title="Model Size vs Data Quality: FAQ Prompt"
+  desc="Figure: 1B vs 12B model on HQ vs LQ data using the FAQ prompt."
+  config={{
+    defaultView: "line",
+    datasetNames: {
+      "mix-fw_edu_hq-faq_1b_hq": "1B, HQ Source",
+      "mix-fw_edu_hq-faq_1b_lq": "1B, LQ Source",
+      "mix-fw_edu_hq-faq_12b_hq": "12B, HQ Source",
+      "mix-fw_edu_hq-faq_12b_lq": "12B, LQ Source"
+    }
+  }}
+/>

In general we cannot reproduce REWIRE's claim that large models are needed for lq data. Overall we rarely see benefits from using models larger than 1b. So as long as the model has some baseline capability (in our experiments already reached at the 1b scale), we see no evidence of a clear benefit from using larger models for rephrasing. For these reasons we default to the 1b size for maximum throughput from here on. We hypothesize that most rephrasing tasks are simple enough for smaller models to handle sufficiently well.
#### Does the model family matter?

Some model families may be better suited for rephrasing than others based on their training data. This is why we test top families at the 1B scale on the four top-performing prompts: tutorial, faq, table and math. We find that for the tutorial prompt at the 1B scale, Llama-3.2, Granite3, Gemma-3, Qwen3 and Falcon3 perform roughly at the same level. SmolLM2 clearly outperforms them.

-<Image ... />
+<HtmlEmbed
+  id="model-family-tutorial"
+  src="d3-benchmark-comparison.html"
+  title="Model Family: Tutorial Prompt"
+  desc="Figure: Model families compared on the tutorial prompt at ~1B scale."
+  config={{
+    defaultView: "line",
+    datasetNames: {
+      "mix-fw_edu_hq-tutorial_smollm2_1.7b_hq": "SmolLM2",
+      "mix-fw_edu_hq-tutorial_falcon3_1b_hq": "Falcon3",
+      "mix-fw_edu_hq-tutorial_qwen3_1.7b_hq": "Qwen3",
+      "mix-fw_edu_hq-tutorial_1b_hq": "Gemma-3",
+      "mix-fw_edu_hq-tutorial_granite3_1b_hq": "Granite3",
+      "mix-fw_edu_hq-tutorial_llama3.2_1b_hq": "Llama-3.2"
+    }
+  }}
+/>

For the faq prompt, SmolLM2 again clearly outperforms the others, while Qwen3 underperforms.

-<Image ... />
+<HtmlEmbed
+  id="model-family-faq"
+  src="d3-benchmark-comparison.html"
+  title="Model Family: FAQ Prompt"
+  desc="Figure: Model families compared on the FAQ prompt at ~1B scale."
+  config={{
+    defaultView: "line",
+    datasetNames: {
+      "mix-fw_edu_hq-faq_smollm2_1.7b_hq": "SmolLM2",
+      "mix-fw_edu_hq-faq_llama3.2_1b_hq": "Llama-3.2",
+      "mix-fw_edu_hq-faq_falcon3_1b_hq": "Falcon3",
+      "mix-fw_edu_hq-faq_1b_hq": "Gemma-3",
+      "mix-fw_edu_hq-faq_granite3_1b_hq": "Granite3",
+      "mix-fw_edu_hq-faq_qwen3_1.7b_hq": "Qwen3"
+    }
+  }}
+/>

For the table prompt we again see SmolLM2 and, to some degree, Falcon3 outperform.

-<Image ... />
+<HtmlEmbed
+  id="model-family-table"
+  src="d3-benchmark-comparison.html"
+  title="Model Family: Table Prompt"
+  desc="Figure: Model families compared on the table prompt at ~1B scale."
+  config={{
+    defaultView: "line",
+    datasetNames: {
+      "mix-fw_edu_hq-table_smollm2_1.7b_hq": "SmolLM2",
+      "mix-fw_edu_hq-table_falcon3_1b_hq": "Falcon3",
+      "mix-fw_edu_hq-table_granite3_1b_hq": "Granite3",
+      "mix-fw_edu_hq-table_qwen3_1.7b_hq": "Qwen3",
+      "mix-fw_edu_hq-table_llama3.2_1b_hq": "Llama-3.2",
+      "mix-fw_edu_hq-table_1b_hq": "Gemma-3"
+    }
+  }}
+/>

Finally, math is again a clear win for SmolLM2, with Qwen3 underperforming.

-<Image ... />
+<HtmlEmbed
+  id="model-family-math"
+  src="d3-benchmark-comparison.html"
+  title="Model Family: Math Prompt"
+  desc="Figure: Model families compared on the math prompt at ~1B scale."
+  config={{
+    defaultView: "line",
+    datasetNames: {
+      "mix-fw_edu_hq-math_smollm2_1.7b_hq": "SmolLM2",
+      "mix-fw_edu_hq-math_falcon3_1b_hq": "Falcon3",
+      "mix-fw_edu_hq-math_granite3_1b_hq": "Granite3",
+      "mix-fw_edu_hq-math_1b_hq": "Gemma-3",
+      "mix-fw_edu_hq-math_llama3.2_1b_hq": "Llama-3.2",
+      "mix-fw_edu_hq-math_qwen3_1.7b_hq": "Qwen3"
+    }
+  }}
+/>

We hypothesize that the consistently strong performance of SmolLM2 originates from [rewrite tasks](https://huggingface.co/datasets/HuggingFaceTB/smoltalk/viewer/smol-rewrite?row=0&views%5B%5D=smol_rewrite_train) in its training data.
So the model family clearly seems to matter. However, SmolLM2 is already a year old. Are newer models better than older ones?

@@ -166,7 +316,23 @@ So the model family clearly seems to matter. However, SmolLM2 is already a year

We compare rephrasing with Qwen models from versions 1.5, 2, 2.5 and 3 using the tutorial prompt, one of the prompts that outperformed the DCLM baseline. While the differences are small, we find a trend that newer versions lead to higher evaluation performance.

-<Image ... />
+<HtmlEmbed
+  id="model-generation"
+  src="d3-benchmark-comparison.html"
+  title="Model Generation: Qwen Tutorial"
+  desc="Figure: Qwen model generations (1.5 to 3) on the tutorial prompt."
+  config={{
+    defaultView: "line",
+    datasetNames: {
+      "mix-fw_edu_hq-tutorial_qwen3_1.7b_hq": "Qwen3 (1.7B)",
+      "mix-fw_edu_hq-tutorial_qwen2.5_1.5b_hq": "Qwen2.5 (1.5B)",
+      "mix-fw_edu_hq-tutorial_qwen2_1.5b_hq": "Qwen2 (1.5B)",
+      dclm: "DCLM",
+      "mix-fw_edu_hq-tutorial_qwen1.5_1.8b_hq": "Qwen1.5 (1.8B)",
+      fw_edu_hq: "FineWeb-Edu (HQ)"
+    }
+  }}
+/>

So now we know that certain models are better than others, newer models tend to outperform older ones, and rephrasing models can usually be as small as 1B parameters. What difference do the dataset choices make?

@@ -176,11 +342,47 @@ So now we know that certain models are better than others, newer models tend to

To test the effect of the mix-in dataset we apply the tutorial prompt using Gemma-3-1b on fw_edu_hq and mix in dclm, cosmopedia, fw_edu_hq and fw_edu_lq. We find that the mix-in dataset makes a substantial difference, with cosmopedia and fw_edu_lq underperforming dclm and fw_edu_hq. fw_edu_hq and dclm achieve very similar accuracy even though dclm is much better by itself. We see that mixing in the synthetic data improves performance for all mix-in datasets. The effect is more pronounced for the weaker datasets fw_edu_lq and cosmopedia.

-<Image ... />
+<HtmlEmbed
+  id="mixin-dataset-hq-source"
+  src="d3-benchmark-comparison.html"
+  title="Mix-in Dataset Effect (HQ Source)"
+  desc="Figure: Effect of different mix-in datasets with fw_edu_hq as source for the tutorial prompt."
+  config={{
+    defaultView: "line",
+    datasetNames: {
+      "mix-dclm-tutorial_1b_hq": "Mix-in: DCLM",
+      "mix-fw_edu_hq-tutorial_1b_hq": "Mix-in: FW-Edu (HQ)",
+      dclm: "DCLM",
+      "mix-fw_edu_lq-tutorial_1b_hq": "Mix-in: FW-Edu (LQ)",
+      "mix-cosmopedia-tutorial_1b_hq": "Mix-in: Cosmopedia",
+      fw_edu_hq: "FineWeb-Edu (HQ)",
+      cosmopedia: "Cosmopedia",
+      fw_edu_lq: "FineWeb-Edu (LQ)"
+    }
+  }}
+/>

Does this trend hold for other source datasets? We ran the experiment with fw_edu_lq as source and find similar results: fw_edu_hq and dclm outperform both cosmopedia and fw_edu_lq. For all mix-in datasets except dclm, adding synthetic data is beneficial.

-<Image ... />
+<HtmlEmbed
+  id="mixin-dataset-lq-source"
+  src="d3-benchmark-comparison.html"
+  title="Mix-in Dataset Effect (LQ Source)"
+  desc="Figure: Effect of different mix-in datasets with fw_edu_lq as source for the tutorial prompt."
+  config={{
+    defaultView: "line",
+    datasetNames: {
+      dclm: "DCLM",
+      "mix-fw_edu_hq-tutorial_1b_lq": "Mix-in: FW-Edu (HQ)",
+      "mix-dclm-tutorial_1b_lq": "Mix-in: DCLM",
+      fw_edu_hq: "FineWeb-Edu (HQ)",
+      "mix-cosmopedia-tutorial_1b_lq": "Mix-in: Cosmopedia",
+      cosmopedia: "Cosmopedia",
+      "mix-fw_edu_lq-tutorial_1b_lq": "Mix-in: FW-Edu (LQ)",
+      fw_edu_lq: "FineWeb-Edu (LQ)"
+    }
+  }}
+/>

So we know that the mix-in dataset plays a large role. What about the source dataset used for rephrasing?

@@ -188,23 +390,111 @@ So we know that the mix-in dataset plays a large role. What about the source dat

To investigate to what extent the source dataset for rephrasing matters, we rephrased dclm, cosmopedia, fw_edu_hq and fw_edu_lq using the Gemma-3-1B model and the tutorial and faq prompts. When we mix the source dataset in with the rephrased data, we find fw_edu_hq and dclm clearly outperforming fw_edu_lq and cosmopedia for both prompts.

-<Image ... />
+<HtmlEmbed
+  id="source-dataset-tutorial"
+  src="d3-benchmark-comparison.html"
+  title="Source Dataset: Tutorial (Mix-in = Source)"
+  desc="Figure: Effect of source dataset choice for the tutorial prompt when mix-in equals source."
+  config={{
+    defaultView: "line",
+    datasetNames: {
+      "mix-fw_edu_hq-tutorial_1b_hq": "Source: FW-Edu (HQ)",
+      "mix-dclm-tutorial_1b_dclm": "Source: DCLM",
+      "mix-cosmopedia-tutorial_1b_cosmopedia": "Source: Cosmopedia",
+      "mix-fw_edu_lq-tutorial_1b_lq": "Source: FW-Edu (LQ)"
+    }
+  }}
+/>

-<Image ... />
+<HtmlEmbed
+  id="source-dataset-faq"
+  src="d3-benchmark-comparison.html"
+  title="Source Dataset: FAQ (Mix-in = Source)"
+  desc="Figure: Effect of source dataset choice for the FAQ prompt when mix-in equals source."
+  config={{
+    defaultView: "line",
+    datasetNames: {
+      "mix-dclm-faq_1b_dclm": "Source: DCLM",
+      "mix-fw_edu_hq-faq_1b_hq": "Source: FW-Edu (HQ)",
+      "mix-fw_edu_lq-faq_1b_lq": "Source: FW-Edu (LQ)",
+      "mix-cosmopedia-faq_1b_cosmopedia": "Source: Cosmopedia"
+    }
+  }}
+/>

When we fix the mix-in dataset to fw_edu_hq, the difference shrinks drastically for the tutorial prompt and even more for the faq prompt. This corroborates our finding that the mix-in datasets seem to matter much more than the source rephrasing datasets.

-<Image ... />
+<HtmlEmbed
+  id="source-dataset-fixed-mixin-tutorial"
+  src="d3-benchmark-comparison.html"
+  title="Source Dataset: Tutorial (Fixed Mix-in: FW-Edu HQ)"
+  desc="Figure: Effect of source dataset for the tutorial prompt with fw_edu_hq as fixed mix-in."
+  config={{
+    defaultView: "line",
+    datasetNames: {
+      "mix-fw_edu_hq-tutorial_1b_dclm": "Source: DCLM",
+      "mix-fw_edu_hq-tutorial_1b_hq": "Source: FW-Edu (HQ)",
+      "mix-fw_edu_hq-tutorial_1b_cosmopedia": "Source: Cosmopedia",
+      "mix-fw_edu_hq-tutorial_1b_lq": "Source: FW-Edu (LQ)"
+    }
+  }}
+/>

-<Image ... />
+<HtmlEmbed
+  id="source-dataset-fixed-mixin-faq"
+  src="d3-benchmark-comparison.html"
+  title="Source Dataset: FAQ (Fixed Mix-in: FW-Edu HQ)"
+  desc="Figure: Effect of source dataset for the FAQ prompt with fw_edu_hq as fixed mix-in."
+  config={{
+    defaultView: "line",
+    datasetNames: {
+      "mix-fw_edu_hq-faq_1b_dclm": "Source: DCLM",
+      "mix-fw_edu_hq-faq_1b_hq": "Source: FW-Edu (HQ)",
+      "mix-fw_edu_hq-faq_1b_lq": "Source: FW-Edu (LQ)",
+      "mix-fw_edu_hq-faq_1b_cosmopedia": "Source: Cosmopedia"
+    }
+  }}
+/>

#### Is synthetic data enough?

We were wondering whether just training on synthetic data works. While we get increased performance over fw_edu_hq, it does not match the original dataset's performance (DCLM) and is also clearly below the performance of the original dataset mixed with the rephrased one, for both the tutorial and faq prompts. We get the same result when we rephrase fw_edu_hq instead of dclm.

-<Image ... />
+<HtmlEmbed
+  id="synthetic-only-dclm"
+  src="d3-benchmark-comparison.html"
+  title="Is Synthetic Data Enough? (DCLM Source)"
+  desc="Figure: Synthetic-only vs mixed training with DCLM as source."
+  config={{
+    defaultView: "line",
+    datasetNames: {
+      "mix-dclm-faq_1b_dclm": "Mix: FAQ + DCLM",
+      dclm: "DCLM",
+      "mix-dclm-tutorial_1b_dclm": "Mix: Tutorial + DCLM",
+      faq_1b_dclm: "FAQ Only",
+      tutorial_1b_dclm: "Tutorial Only",
+      fw_edu_hq: "FineWeb-Edu (HQ)"
+    }
+  }}
+/>

-<Image ... />
+<HtmlEmbed
+  id="synthetic-only-hq"
+  src="d3-benchmark-comparison.html"
+  title="Is Synthetic Data Enough? (FW-Edu HQ Source)"
+  desc="Figure: Synthetic-only vs mixed training with FW-Edu (HQ) as source."
+  config={{
+    defaultView: "line",
+    datasetNames: {
+      "mix-fw_edu_hq-faq_1b_hq": "Mix: FAQ + FW-Edu (HQ)",
+      "mix-fw_edu_hq-tutorial_1b_hq": "Mix: Tutorial + FW-Edu (HQ)",
+      dclm: "DCLM",
+      faq_1b_hq: "FAQ Only",
+      tutorial_1b_hq: "Tutorial Only",
+      fw_edu_hq: "FineWeb-Edu (HQ)"
+    }
+  }}
+/>

#### Does increased diversity help?

@@ -212,18 +502,71 @@ There are multiple ways of increasing diversity. We can think of mixing rephrasi
**Mixing rephrasing approaches**
We were wondering whether mixing the best-performing rephrasing approaches can improve over the individual approaches. We find no significant increase over the best-performing approach (mix-fw_edu_hq-math_1b_hq). It seems that when we mix together enough different prompts (mix-tutorial_1b_hq-faq_1b_hq-table_1b_hq-math_1b_hq), we don't necessarily need the source dataset (fw_edu_hq) for good performance. This could mean that when training on just one synthetic dataset we need the original dataset for diversity, but when we mix multiple ones it is not necessary. However, it does not hurt and is an easy way of increasing the dataset size while keeping performance high. To follow up, it would be interesting to study how little synthetic data we can get away with without performance drops.

-<Image ... />
+<HtmlEmbed
+  id="mixing-approaches"
+  src="d3-benchmark-comparison.html"
+  title="Mixing Rephrasing Approaches"
+  desc="Figure: Mixing multiple prompts vs individual prompts."
+  config={{
+    defaultView: "line",
+    datasetNames: {
+      "mix-fw_edu_hq-tutorial_1b_hq-fw_edu_hq-faq_1b_hq-table_1b_hq-math_1b_hq": "All Prompts + FW-Edu (HQ)",
+      "mix-fw_edu_hq-math_1b_hq": "Math",
+      "mix-tutorial_1b_hq-faq_1b_hq-table_1b_hq-math_1b_hq": "All Prompts (No Source)",
+      "mix-fw_edu_hq-table_1b_hq": "Table",
+      "mix-fw_edu_hq-faq_1b_hq": "FAQ",
+      "mix-fw_edu_hq-tutorial_1b_hq": "Tutorial",
+      dclm: "DCLM",
+      fw_edu_hq: "FineWeb-Edu (HQ)"
+    }
+  }}
+/>

**Mixing model families**
We rephrased using different model families and saw SmolLM2 and Falcon3 clearly outperform Llama3.2 and Granite3. Now we wonder whether mixing the rephrased outputs of multiple models improves performance through increased diversity.

-<Image ... />
+<HtmlEmbed
+  id="mixing-model-families"
+  src="d3-benchmark-comparison.html"
+  title="Mixing Model Families"
+  desc="Figure: Mixing rephrased outputs from different model families."
+  config={{
+    defaultView: "line",
+    datasetNames: {
+      "mix-fw_edu_hq-tutorial_smollm2_1.7b_hq": "SmolLM2",
+      "mix-fw_edu_hq-tutorial_smollm2_1.7b_hq-tutorial_falcon3_1b_hq": "SmolLM2 + Falcon3",
+      "mix-fw_edu_hq-tutorial_smollm2_1.7b_hq-tutorial_llama3.2_1b_hq": "SmolLM2 + Llama-3.2",
+      "mix-fw_edu_hq-tutorial_llama3.2_1b_hq-tutorial_granite3_1b_hq": "Llama-3.2 + Granite3",
+      "mix-fw_edu_hq-tutorial_llama3.2_1b_hq": "Llama-3.2",
+      dclm: "DCLM",
+      fw_edu_hq: "FineWeb-Edu (HQ)"
+    }
+  }}
+/>

It turns out that benchmark performance does not improve through increased rephrasing-model diversity but is largely an average of the mixed datasets' performance (smollm2 and falcon3 are similar to just smollm2, smollm2 and llama3.2 lie in between smollm2 and llama3.2, llama3.2 and granite3 are similar to just llama3.2).
**Mixing both rephrasing approaches and model families**
Maybe we need more diversity by mixing both rephrasing approaches and model families?

-<Image ... />
+<HtmlEmbed
+  id="mixing-both"
+  src="d3-benchmark-comparison.html"
+  title="Mixing Approaches and Model Families"
+  desc="Figure: Mixing both rephrasing approaches and model families."
+  config={{
+    defaultView: "line",
+    datasetNames: {
+      "mix-fw_edu_hq-faq_smollm2_1.7b_hq": "FAQ (SmolLM2)",
+      "mix-fw_edu_hq-faq_smollm2_1.7b_hq-tutorial_falcon3_1b_hq": "FAQ (SmolLM2) + Tutorial (Falcon3)",
+      "mix-fw_edu_hq-tutorial_smollm2_1.7b_hq": "Tutorial (SmolLM2)",
+      "mix-fw_edu_hq-tutorial_smollm2_1.7b_hq-tutorial_falcon3_1b_hq": "Tutorial (SmolLM2) + Tutorial (Falcon3)",
+      "mix-fw_edu_hq-tutorial_falcon3_1b_hq": "Tutorial (Falcon3)",
+      "mix-fw_edu_hq-faq_falcon3_1b_hq": "FAQ (Falcon3)",
+      dclm: "DCLM",
+      fw_edu_hq: "FineWeb-Edu (HQ)"
+    }
+  }}
+/>

No, we get the same results as for just mixing rephrasing approaches or model families independently: the mix lands around the average performance instead of resulting in a gain.

@@ -231,7 +574,23 @@ No, we get the same results as for just mixing rephrasing approaches or model fa

The original REWIRE prompt contains many typos and grammar errors. To what extent do typos in the prompt hurt performance?

-<Image ... />
+<HtmlEmbed
+  id="typos-effect"
+  src="d3-benchmark-comparison.html"
+  title="Effect of Typos in Prompt"
+  desc="Figure: REWIRE prompt with original typos vs improved version at 1B and 12B scale."
+  config={{
+    defaultView: "line",
+    datasetNames: {
+      "mix-fw_edu_hq-guided_rewrite_original_12b_hq": "Original (12B)",
+      "mix-fw_edu_hq-guided_rewrite_improved_12b_hq": "Improved (12B)",
+      dclm: "DCLM",
+      "mix-fw_edu_hq-guided_rewrite_original_1b_hq": "Original (1B)",
+      "mix-fw_edu_hq-guided_rewrite_improved_1b_hq": "Improved (1B)",
+      fw_edu_hq: "FineWeb-Edu (HQ)"
+    }
+  }}
+/>

Surprisingly, typos don't have a negative effect on downstream model performance. For the 1b model, even the opposite is the case.

|
| 1 |
import HtmlEmbed from "../../components/HtmlEmbed.astro";
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
|
| 3 |
## Experiments
|
| 4 |
|
|
|
|
| 74 |
|
| 75 |
Using gemma-3-1b, the prompt from REWIRE (guided_rewrite_original) is on-par with DCLM in our setup. Nemotron-HQ-Synth was created using five prompts: diverse_qa_pairs, extract_knowledge, distil, wikipedia_style_rephrasing and knowledge_list. The only prompt that really works well in our setup is diverse_qa_pairs. This is mainly due to very strong performance on SQUAD. We used fineweb-edu-hq as the source dataset for all prompts.
|
| 76 |
|
| 77 |
+
<HtmlEmbed
|
| 78 |
+
id="dissecting-baselines"
|
| 79 |
+
src="d3-benchmark-comparison.html"
|
| 80 |
+
title="Dissecting Synthetic Baselines"
|
| 81 |
+
desc="Figure: Individual prompt performance from existing synthetic datasets compared to DCLM and FineWeb-Edu (HQ)."
|
| 82 |
+
config={{
|
| 83 |
+
defaultView: "line",
|
| 84 |
+
datasetNames: {
|
| 85 |
+
"mix-fw_edu_hq-diverse_qa_pairs_1b_hq": "Diverse QA Pairs",
|
| 86 |
+
dclm: "DCLM",
|
| 87 |
+
"mix-fw_edu_hq-extract_knowledge_1b_hq": "Extract Knowledge",
|
| 88 |
+
"mix-fw_edu_hq-guided_rewrite_original_1b_hq": "Guided Rewrite (REWIRE)",
|
| 89 |
+
nemotron_hq_synth: "Nemotron-HQ-Synth",
|
| 90 |
+
"mix-fw_edu_hq-distill_1b_hq": "Distill",
|
| 91 |
+
"mix-fw_edu_hq-wikipedia_style_rephrasing_1b_hq": "Wikipedia Rephrasing",
|
| 92 |
+
"mix-fw_edu_hq-knowledge_list_1b_hq": "Knowledge List",
|
| 93 |
+
fw_edu_hq: "FineWeb-Edu (HQ)"
|
| 94 |
+
}
|
| 95 |
+
}}
|
| 96 |
+
/>
|
| 97 |
|
| 98 |
We see that dclm is a very strong baseline: apart from the diverse_qa_pairs prompt from the Nemotron-HQ-Synth dataset, no other open prior work outperforms dclm. Can we do better with different prompts?
|
| 99 |
|
|
|
|
| 101 |
|
| 102 |
We found four prompts that outperform both fw-edu-hq and the challenging dclm baseline: math, table, faq and tutorial.
|
| 103 |
|
| 104 |
+
<HtmlEmbed
|
| 105 |
+
id="new-prompts"
|
| 106 |
+
src="d3-benchmark-comparison.html"
|
| 107 |
+
title="New Prompt Performance"
|
| 108 |
+
desc="Figure: Four new prompts (math, table, faq, tutorial) compared against DCLM and FineWeb-Edu (HQ)."
|
| 109 |
+
config={{
|
| 110 |
+
defaultView: "line",
|
| 111 |
+
datasetNames: {
|
| 112 |
+
"mix-fw_edu_hq-math_1b_hq": "Math",
|
| 113 |
+
"mix-fw_edu_hq-table_1b_hq": "Table",
|
| 114 |
+
"mix-fw_edu_hq-faq_1b_hq": "FAQ",
|
| 115 |
+
"mix-fw_edu_hq-tutorial_1b_hq": "Tutorial",
|
| 116 |
+
dclm: "DCLM",
|
| 117 |
+
fw_edu_hq: "FineWeb-Edu (HQ)"
|
| 118 |
+
}
|
| 119 |
+
}}
|
| 120 |
+
/>
|
| 121 |
|
| 122 |
For now we just used the Gemma-3-1b model, but can we do better by changing the rephrasing model?
|
| 123 |
|
|
|
|
| 129 |
|
| 130 |
We compare rephrasing with all Gemma-3 sizes (270m, 1b, 4b, 12b, 27b) using the tutorial prompt. We find that the 270m model underperforms but otherwise there is no significant difference.
|
| 131 |
|
| 132 |
+
<HtmlEmbed
|
| 133 |
+
id="model-size-tutorial"
|
| 134 |
+
src="d3-benchmark-comparison.html"
|
| 135 |
+
title="Model Size: Tutorial Prompt"
|
| 136 |
+
desc="Figure: Gemma-3 model sizes (270M to 27B) on the tutorial prompt."
|
| 137 |
+
config={{
|
| 138 |
+
defaultView: "line",
|
| 139 |
+
datasetNames: {
|
| 140 |
+
"mix-fw_edu_hq-tutorial_27b_hq": "Gemma-3 27B",
|
| 141 |
+
"mix-fw_edu_hq-tutorial_12b_hq": "Gemma-3 12B",
|
| 142 |
+
"mix-fw_edu_hq-tutorial_4b_hq": "Gemma-3 4B",
|
| 143 |
+
"mix-fw_edu_hq-tutorial_1b_hq": "Gemma-3 1B",
|
| 144 |
+
"mix-fw_edu_hq-tutorial_270m_hq": "Gemma-3 270M",
|
| 145 |
+
dclm: "DCLM",
|
| 146 |
+
fw_edu_hq: "FineWeb-Edu (HQ)"
|
| 147 |
+
}
|
| 148 |
+
}}
|
| 149 |
+
/>
|
| 150 |
|
| 151 |
Potentially, writing a tutorial is easy enough and we only need larger models for harder prompts such as Math. So we tested it there too, but find similar results with the 270m underperforming and no large difference between 1b, 4b, 12b and 27b.
|
| 152 |
|
| 153 |
+
<HtmlEmbed
|
| 154 |
+
id="model-size-math"
|
| 155 |
+
src="d3-benchmark-comparison.html"
|
| 156 |
+
title="Model Size: Math Prompt"
|
| 157 |
+
desc="Figure: Gemma-3 model sizes (270M to 27B) on the math prompt."
|
| 158 |
+
config={{
|
| 159 |
+
defaultView: "line",
|
| 160 |
+
datasetNames: {
|
| 161 |
+
"mix-fw_edu_hq-math_1b_hq": "Gemma-3 1B",
|
| 162 |
+
"mix-fw_edu_hq-math_4b_hq": "Gemma-3 4B",
|
| 163 |
+
"mix-fw_edu_hq-math_27b_hq": "Gemma-3 27B",
|
| 164 |
+
"mix-fw_edu_hq-math_12b_hq": "Gemma-3 12B",
|
| 165 |
+
"mix-fw_edu_hq-math_270m_hq": "Gemma-3 270M",
|
| 166 |
+
dclm: "DCLM",
|
| 167 |
+
fw_edu_hq: "FineWeb-Edu (HQ)"
|
| 168 |
+
}
|
| 169 |
+
}}
|
| 170 |
+
/>
|
| 171 |
|
| 172 |
TODO: also run this experiment for the REWIRE prompt since the original authors claim that larger models are necessary there
|
| 173 |
|
|
|
|
| 175 |
The [REWIRE](https://arxiv.org/abs/2506.04689) paper claims that for upcycling low quality data we need large models (Llama-3.3 70B in their case). Is this true?
|
| 176 |
Continue prompt: For the 1b model the source data does not seem to matter, but the 12b model can make use of the hq data better.
|
| 177 |
|
| 178 |
+
<HtmlEmbed
|
| 179 |
+
id="size-quality-continue"
|
| 180 |
+
src="d3-benchmark-comparison.html"
|
| 181 |
+
title="Model Size vs Data Quality: Continue Prompt"
|
| 182 |
+
desc="Figure: 1B vs 12B model on HQ vs LQ data using the continue prompt."
|
| 183 |
+
config={{
|
| 184 |
+
defaultView: "line",
|
| 185 |
+
datasetNames: {
|
| 186 |
+
"mix-fw_edu_hq-continue_12b_hq": "12B, HQ Source",
|
| 187 |
+
"mix-fw_edu_hq-continue_1b_hq": "1B, HQ Source",
|
| 188 |
+
"mix-fw_edu_hq-continue_1b_lq": "1B, LQ Source",
|
| 189 |
+
"mix-fw_edu_hq-continue_12b_lq": "12B, LQ Source"
|
| 190 |
+
}
|
| 191 |
+
}}
|
| 192 |
+
/>
|
| 193 |
|
| 194 |
Tutorial prompt: For the hq data the model size does not seem to matter whereas for the lq data the larger model is slightly better.
|
| 195 |
|
| 196 |
+
<HtmlEmbed
|
| 197 |
+
id="size-quality-tutorial"
|
| 198 |
+
src="d3-benchmark-comparison.html"
|
| 199 |
+
title="Model Size vs Data Quality: Tutorial Prompt"
|
| 200 |
+
desc="Figure: 1B vs 12B model on HQ vs LQ data using the tutorial prompt."
|
| 201 |
+
config={{
|
| 202 |
+
defaultView: "line",
|
| 203 |
+
datasetNames: {
|
| 204 |
+
"mix-fw_edu_hq-tutorial_1b_hq": "1B, HQ Source",
|
| 205 |
+
"mix-fw_edu_hq-tutorial_12b_hq": "12B, HQ Source",
|
| 206 |
+
"mix-fw_edu_hq-tutorial_12b_lq": "12B, LQ Source",
|
| 207 |
+
"mix-fw_edu_hq-tutorial_1b_lq": "1B, LQ Source"
|
| 208 |
+
}
|
| 209 |
+
}}
|
| 210 |
+
/>
|
| 211 |
|
| 212 |
FAQ prompt: Surprisingly, the 1b model is better for both lq and hq data.
|
| 213 |
|
| 214 |
+
<HtmlEmbed
|
| 215 |
+
id="size-quality-faq"
|
| 216 |
+
src="d3-benchmark-comparison.html"
|
| 217 |
+
title="Model Size vs Data Quality: FAQ Prompt"
|
| 218 |
+
desc="Figure: 1B vs 12B model on HQ vs LQ data using the FAQ prompt."
|
| 219 |
+
config={{
|
| 220 |
+
defaultView: "line",
|
| 221 |
+
datasetNames: {
|
| 222 |
+
"mix-fw_edu_hq-faq_1b_hq": "1B, HQ Source",
|
| 223 |
+
"mix-fw_edu_hq-faq_1b_lq": "1B, LQ Source",
|
| 224 |
+
"mix-fw_edu_hq-faq_12b_hq": "12B, HQ Source",
|
| 225 |
+
"mix-fw_edu_hq-faq_12b_lq": "12B, LQ Source"
|
| 226 |
+
}
|
| 227 |
+
}}
|
| 228 |
+
/>
|
| 229 |
|
| 230 |
In general we cannot reproduce REWIRE's claim that large models are needed for lq data. Overall we rarely see benefits of using models larger than 1b. So as long as the model has some baseline level (in our experiments already reached at the 1b scale) we see no evidence for a clear benefit of using larger models for rephrasing. For these reasons we default to the 1b size for maximum throughput from here on. We hypothesize that most rephrasing tasks are simple enough for smaller models to handle sufficiently well.
|
| 231 |
#### Does the model family matter?
|
| 232 |
|
| 233 |
Some model families may be better suited for rephrasing than others based on their training data. This is why we test top families at the 1B scale on the four top-performing prompts tutorial, faq, table, math. We find that for the tutorial prompt at the 1B scale Llama-3.2, Granite-3, Gemma-3, and Qwen3 and Falcon3 perform roughly at the same level. SmolLM2 clearly outperforms.
|
| 234 |
|
| 235 |
+
<HtmlEmbed
|
| 236 |
+
id="model-family-tutorial"
|
| 237 |
+
src="d3-benchmark-comparison.html"
|
| 238 |
+
title="Model Family: Tutorial Prompt"
|
| 239 |
+
desc="Figure: Model families compared on the tutorial prompt at ~1B scale."
|
| 240 |
+
config={{
|
| 241 |
+
defaultView: "line",
|
| 242 |
+
datasetNames: {
|
| 243 |
+
"mix-fw_edu_hq-tutorial_smollm2_1.7b_hq": "SmolLM2",
|
| 244 |
+
"mix-fw_edu_hq-tutorial_falcon3_1b_hq": "Falcon3",
|
| 245 |
+
"mix-fw_edu_hq-tutorial_qwen3_1.7b_hq": "Qwen3",
|
| 246 |
+
"mix-fw_edu_hq-tutorial_1b_hq": "Gemma-3",
|
| 247 |
+
"mix-fw_edu_hq-tutorial_granite3_1b_hq": "Granite3",
|
| 248 |
+
"mix-fw_edu_hq-tutorial_llama3.2_1b_hq": "Llama-3.2"
|
| 249 |
+
}
|
| 250 |
+
}}
|
| 251 |
+
/>
|
| 252 |
|
| 253 |
In the faq prompt SmolLM2 again clearly outperforms the others. Here Qwen3 underperforms.
|
| 254 |
|
| 255 |
+
<HtmlEmbed
|
| 256 |
+
id="model-family-faq"
|
| 257 |
+
src="d3-benchmark-comparison.html"
|
| 258 |
+
title="Model Family: FAQ Prompt"
|
| 259 |
+
desc="Figure: Model families compared on the FAQ prompt at ~1B scale."
|
| 260 |
+
config={{
|
| 261 |
+
defaultView: "line",
|
| 262 |
+
datasetNames: {
|
| 263 |
+
"mix-fw_edu_hq-faq_smollm2_1.7b_hq": "SmolLM2",
|
| 264 |
+
"mix-fw_edu_hq-faq_llama3.2_1b_hq": "Llama-3.2",
|
| 265 |
+
"mix-fw_edu_hq-faq_falcon3_1b_hq": "Falcon3",
|
| 266 |
+
"mix-fw_edu_hq-faq_1b_hq": "Gemma-3",
|
| 267 |
+
"mix-fw_edu_hq-faq_granite3_1b_hq": "Granite3",
|
| 268 |
+
"mix-fw_edu_hq-faq_qwen3_1.7b_hq": "Qwen3"
|
| 269 |
+
}
|
| 270 |
+
}}
|
| 271 |
+
/>
|
| 272 |
|
| 273 |
For the table prompt we again see SmolLM2 and to some degree Falcon3 outperform.
|
| 274 |
|
| 275 |
+
<HtmlEmbed
|
| 276 |
+
id="model-family-table"
|
| 277 |
+
src="d3-benchmark-comparison.html"
|
| 278 |
+
title="Model Family: Table Prompt"
|
| 279 |
+
desc="Figure: Model families compared on the table prompt at ~1B scale."
|
| 280 |
+
config={{
|
| 281 |
+
defaultView: "line",
|
| 282 |
+
datasetNames: {
|
| 283 |
+
"mix-fw_edu_hq-table_smollm2_1.7b_hq": "SmolLM2",
|
| 284 |
+
"mix-fw_edu_hq-table_falcon3_1b_hq": "Falcon3",
|
| 285 |
+
"mix-fw_edu_hq-table_granite3_1b_hq": "Granite3",
|
| 286 |
+
"mix-fw_edu_hq-table_qwen3_1.7b_hq": "Qwen3",
|
| 287 |
+
"mix-fw_edu_hq-table_llama3.2_1b_hq": "Llama-3.2",
|
| 288 |
+
"mix-fw_edu_hq-table_1b_hq": "Gemma-3"
|
| 289 |
+
}
|
| 290 |
+
}}
|
| 291 |
+
/>
|
| 292 |
|
| 293 |
Finally, math is again a clear win for SmolLM2 with Qwen3 underperforming.
|
| 294 |
|
| 295 |
+
<HtmlEmbed
|
| 296 |
+
id="model-family-math"
|
| 297 |
+
src="d3-benchmark-comparison.html"
|
| 298 |
+
title="Model Family: Math Prompt"
|
| 299 |
+
desc="Figure: Model families compared on the math prompt at ~1B scale."
|
| 300 |
+
config={{
|
| 301 |
+
defaultView: "line",
|
| 302 |
+
datasetNames: {
|
| 303 |
+
"mix-fw_edu_hq-math_smollm2_1.7b_hq": "SmolLM2",
|
| 304 |
+
"mix-fw_edu_hq-math_falcon3_1b_hq": "Falcon3",
|
| 305 |
+
"mix-fw_edu_hq-math_granite3_1b_hq": "Granite3",
|
| 306 |
+
"mix-fw_edu_hq-math_1b_hq": "Gemma-3",
|
| 307 |
+
"mix-fw_edu_hq-math_llama3.2_1b_hq": "Llama-3.2",
|
| 308 |
+
"mix-fw_edu_hq-math_qwen3_1.7b_hq": "Qwen3"
|
| 309 |
+
}
|
| 310 |
+
}}
|
| 311 |
+
/>
|
| 312 |
|
| 313 |
We hypothesize that the consistently strong performance of SmolLM2 originates from [rewrite tasks](https://huggingface.co/datasets/HuggingFaceTB/smoltalk/viewer/smol-rewrite?row=0&views%5B%5D=smol_rewrite_train) in the training data.
|
| 314 |
So the model family clearly seems to matter. However, SmolLM2 is already a year old. Are newer models better than older ones?
|
|
|
|
| 316 |
|
| 317 |
We compare rephrasing with Qwen models from versions 1.5, 2, 2.5 and 3 using the tutorial prompt, one of the prompts that outperformed the DCLM baseline. While the differences are small we find a trend that newer versions lead to higher evaluation performance.
|
| 318 |
|
| 319 |
+
<HtmlEmbed
|
| 320 |
+
id="model-generation"
|
| 321 |
+
src="d3-benchmark-comparison.html"
|
| 322 |
+
title="Model Generation: Qwen Tutorial"
|
| 323 |
+
desc="Figure: Qwen model generations (1.5 to 3) on the tutorial prompt."
|
| 324 |
+
config={{
|
| 325 |
+
defaultView: "line",
|
| 326 |
+
datasetNames: {
|
| 327 |
+
"mix-fw_edu_hq-tutorial_qwen3_1.7b_hq": "Qwen3 (1.7B)",
|
| 328 |
+
"mix-fw_edu_hq-tutorial_qwen2.5_1.5b_hq": "Qwen2.5 (1.5B)",
|
| 329 |
+
"mix-fw_edu_hq-tutorial_qwen2_1.5b_hq": "Qwen2 (1.5B)",
|
| 330 |
+
dclm: "DCLM",
|
| 331 |
+
"mix-fw_edu_hq-tutorial_qwen1.5_1.8b_hq": "Qwen1.5 (1.8B)",
|
| 332 |
+
fw_edu_hq: "FineWeb-Edu (HQ)"
|
| 333 |
+
}
|
| 334 |
+
}}
|
| 335 |
+
/>
|
| 336 |
|
| 337 |
So now we know that certain models are better than others, newer models tend to outperform older models and usually rephrasing models can be as small as 1B parameters. What difference do the dataset choices make?
|
| 338 |
|
|
|
|
| 342 |
|
| 343 |
To test the effect of the mix-in dataset we apply the tutorial prompt using Gemma-3-1b on fw_edu_hq and mix in dclm, cosmopedia, fw_edu_hq and fw_edu_lq. We find that the mix-in dataset makes a substantial difference, with cosmopedia and fw_edu_lq underperforming dclm and fw_edu_hq. fw_edu_hq and dclm achieve very similar accuracy even though dclm is much better by itself. We see that mixing in the synthetic data improves performance for all mix-in datasets. The effect is more pronounced for the worse datasets fw_edu_lq and cosmopedia.
|
| 344 |
|
| 345 |
+
<HtmlEmbed
|
| 346 |
+
id="mixin-dataset-hq-source"
|
| 347 |
+
src="d3-benchmark-comparison.html"
|
| 348 |
+
title="Mix-in Dataset Effect (HQ Source)"
|
| 349 |
+
desc="Figure: Effect of different mix-in datasets with fw_edu_hq as source for the tutorial prompt."
|
| 350 |
+
config={{
|
| 351 |
+
defaultView: "line",
|
| 352 |
+
datasetNames: {
|
| 353 |
+
"mix-dclm-tutorial_1b_hq": "Mix-in: DCLM",
|
| 354 |
+
"mix-fw_edu_hq-tutorial_1b_hq": "Mix-in: FW-Edu (HQ)",
|
| 355 |
+
dclm: "DCLM",
|
| 356 |
+
"mix-fw_edu_lq-tutorial_1b_hq": "Mix-in: FW-Edu (LQ)",
|
| 357 |
+
"mix-cosmopedia-tutorial_1b_hq": "Mix-in: Cosmopedia",
|
| 358 |
+
fw_edu_hq: "FineWeb-Edu (HQ)",
|
| 359 |
+
cosmopedia: "Cosmopedia",
|
| 360 |
+
fw_edu_lq: "FineWeb-Edu (LQ)"
|
| 361 |
+
}
|
| 362 |
+
}}
|
| 363 |
+
/>
|
| 364 |
|
| 365 |
Does this trend hold for other source datasets? We ran the experiment for fw_edu_lq as source and find similar results: fw_edu_hq and dclm outperform both cosmopedia and fw_edu_lq. For all mix-in datasets except dclm, adding synthetic data is beneficial.
|
| 366 |
|
| 367 |
+
<HtmlEmbed
|
| 368 |
+
id="mixin-dataset-lq-source"
|
| 369 |
+
src="d3-benchmark-comparison.html"
|
| 370 |
+
title="Mix-in Dataset Effect (LQ Source)"
|
| 371 |
+
desc="Figure: Effect of different mix-in datasets with fw_edu_lq as source for the tutorial prompt."
|
| 372 |
+
config={{
|
| 373 |
+
defaultView: "line",
|
| 374 |
+
datasetNames: {
|
| 375 |
+
dclm: "DCLM",
|
| 376 |
+
"mix-fw_edu_hq-tutorial_1b_lq": "Mix-in: FW-Edu (HQ)",
|
| 377 |
+
"mix-dclm-tutorial_1b_lq": "Mix-in: DCLM",
|
| 378 |
+
fw_edu_hq: "FineWeb-Edu (HQ)",
|
| 379 |
+
"mix-cosmopedia-tutorial_1b_lq": "Mix-in: Cosmopedia",
|
| 380 |
+
cosmopedia: "Cosmopedia",
|
| 381 |
+
"mix-fw_edu_lq-tutorial_1b_lq": "Mix-in: FW-Edu (LQ)",
|
| 382 |
+
fw_edu_lq: "FineWeb-Edu (LQ)"
|
| 383 |
+
}
|
| 384 |
+
}}
|
| 385 |
+
/>
|
| 386 |
|
| 387 |
So we know that the mix-in dataset plays a large role. What about the source dataset used for rephrasing?
|
| 388 |
|
|
|
|
| 390 |
|
| 391 |
To investigate to what extent the source dataset for rephrasing matters we rephrased dclm, cosmopedia, fw_edu_hq and fw_edu_lq using the Gemma-3-1B model and the tutorial and faq prompts. When we mix in the source dataset with the rephrased data we find fw_edu_hq and dclm clearly outperforming fw_edu_lq and cosmopedia for both prompts.
|
| 392 |
|
| 393 |
+
<HtmlEmbed
|
| 394 |
+
id="source-dataset-tutorial"
|
| 395 |
+
src="d3-benchmark-comparison.html"
|
| 396 |
+
title="Source Dataset: Tutorial (Mix-in = Source)"
|
| 397 |
+
desc="Figure: Effect of source dataset choice for the tutorial prompt when mix-in equals source."
|
| 398 |
+
config={{
|
| 399 |
+
defaultView: "line",
|
| 400 |
+
datasetNames: {
|
| 401 |
+
"mix-fw_edu_hq-tutorial_1b_hq": "Source: FW-Edu (HQ)",
|
| 402 |
+
"mix-dclm-tutorial_1b_dclm": "Source: DCLM",
|
| 403 |
+
"mix-cosmopedia-tutorial_1b_cosmopedia": "Source: Cosmopedia",
|
| 404 |
+
"mix-fw_edu_lq-tutorial_1b_lq": "Source: FW-Edu (LQ)"
|
| 405 |
+
}
|
| 406 |
+
}}
|
| 407 |
+
/>
|
| 408 |
|
| 409 |
+
<HtmlEmbed
|
| 410 |
+
id="source-dataset-faq"
|
| 411 |
+
src="d3-benchmark-comparison.html"
|
| 412 |
+
title="Source Dataset: FAQ (Mix-in = Source)"
|
| 413 |
+
desc="Figure: Effect of source dataset choice for the FAQ prompt when mix-in equals source."
|
| 414 |
+
config={{
|
| 415 |
+
defaultView: "line",
|
| 416 |
+
datasetNames: {
|
| 417 |
+
"mix-dclm-faq_1b_dclm": "Source: DCLM",
|
| 418 |
+
"mix-fw_edu_hq-faq_1b_hq": "Source: FW-Edu (HQ)",
|
| 419 |
+
"mix-fw_edu_lq-faq_1b_lq": "Source: FW-Edu (LQ)",
|
| 420 |
+
"mix-cosmopedia-faq_1b_cosmopedia": "Source: Cosmopedia"
|
| 421 |
+
}
|
| 422 |
+
}}
|
| 423 |
+
/>
|
| 424 |
|
| 425 |
When fix the mix-in dataset to fw_edu_hq, the difference shrinks drastically for the tutorial prompt and even more for the faq prompt. This corroborates our finding that the mix-in datasets seem to matter much more than the source rephrasing datasets.
|
| 426 |
|
| 427 |
+
<HtmlEmbed
|
| 428 |
+
id="source-dataset-fixed-mixin-tutorial"
|
| 429 |
+
src="d3-benchmark-comparison.html"
|
| 430 |
+
title="Source Dataset: Tutorial (Fixed Mix-in: FW-Edu HQ)"
|
| 431 |
+
desc="Figure: Effect of source dataset for the tutorial prompt with fw_edu_hq as fixed mix-in."
|
| 432 |
+
config={{
|
| 433 |
+
defaultView: "line",
|
| 434 |
+
datasetNames: {
|
| 435 |
+
"mix-fw_edu_hq-tutorial_1b_dclm": "Source: DCLM",
|
| 436 |
+
"mix-fw_edu_hq-tutorial_1b_hq": "Source: FW-Edu (HQ)",
|
| 437 |
+
"mix-fw_edu_hq-tutorial_1b_cosmopedia": "Source: Cosmopedia",
|
| 438 |
+
"mix-fw_edu_hq-tutorial_1b_lq": "Source: FW-Edu (LQ)"
|
| 439 |
+
}
|
| 440 |
+
}}
|
| 441 |
+
/>
|
| 442 |
|
| 443 |
+
<HtmlEmbed
|
| 444 |
+
id="source-dataset-fixed-mixin-faq"
|
| 445 |
+
src="d3-benchmark-comparison.html"
|
| 446 |
+
title="Source Dataset: FAQ (Fixed Mix-in: FW-Edu HQ)"
|
| 447 |
+
desc="Figure: Effect of source dataset for the FAQ prompt with fw_edu_hq as fixed mix-in."
|
| 448 |
+
config={{
|
| 449 |
+
defaultView: "line",
|
| 450 |
+
datasetNames: {
|
| 451 |
+
"mix-fw_edu_hq-faq_1b_dclm": "Source: DCLM",
|
| 452 |
+
"mix-fw_edu_hq-faq_1b_hq": "Source: FW-Edu (HQ)",
|
| 453 |
+
"mix-fw_edu_hq-faq_1b_lq": "Source: FW-Edu (LQ)",
|
| 454 |
+
"mix-fw_edu_hq-faq_1b_cosmopedia": "Source: Cosmopedia"
|
| 455 |
+
}
|
| 456 |
+
}}
|
| 457 |
+
/>
|
| 458 |
|
| 459 |
#### Is synthetic data enough?
|
| 460 |
|
| 461 |
We were wondering whether just training on synthetic data works. While we get increased performance over fw-edu-hq, it does not match the original dataset performance (DCLM) and also is clearly below the performance of the original dataset mixed with the rephrased one for both the tutorial and faq prompts. We get the same result when we rephrase fw_edu_hq instead of dclm.
|
| 462 |
|
| 463 |
+
<HtmlEmbed
|
| 464 |
+
id="synthetic-only-dclm"
|
| 465 |
+
src="d3-benchmark-comparison.html"
|
| 466 |
+
title="Is Synthetic Data Enough? (DCLM Source)"
|
| 467 |
+
desc="Figure: Synthetic-only vs mixed training with DCLM as source."
|
| 468 |
+
config={{
|
| 469 |
+
defaultView: "line",
|
| 470 |
+
datasetNames: {
|
| 471 |
+
"mix-dclm-faq_1b_dclm": "Mix: FAQ + DCLM",
|
| 472 |
+
dclm: "DCLM",
|
| 473 |
+
"mix-dclm-tutorial_1b_dclm": "Mix: Tutorial + DCLM",
|
| 474 |
+
faq_1b_dclm: "FAQ Only",
|
| 475 |
+
tutorial_1b_dclm: "Tutorial Only",
|
| 476 |
+
fw_edu_hq: "FineWeb-Edu (HQ)"
|
| 477 |
+
}
|
| 478 |
+
}}
|
| 479 |
+
/>
|
| 480 |
|
| 481 |
+
<HtmlEmbed
|
| 482 |
+
id="synthetic-only-hq"
|
| 483 |
+
src="d3-benchmark-comparison.html"
|
| 484 |
+
title="Is Synthetic Data Enough? (FW-Edu HQ Source)"
|
| 485 |
+
desc="Figure: Synthetic-only vs mixed training with FW-Edu (HQ) as source."
|
| 486 |
+
config={{
|
| 487 |
+
defaultView: "line",
|
| 488 |
+
datasetNames: {
|
| 489 |
+
"mix-fw_edu_hq-faq_1b_hq": "Mix: FAQ + FW-Edu (HQ)",
|
| 490 |
+
"mix-fw_edu_hq-tutorial_1b_hq": "Mix: Tutorial + FW-Edu (HQ)",
|
| 491 |
+
dclm: "DCLM",
|
| 492 |
+
faq_1b_hq: "FAQ Only",
|
| 493 |
+
tutorial_1b_hq: "Tutorial Only",
|
| 494 |
+
fw_edu_hq: "FineWeb-Edu (HQ)"
|
| 495 |
+
}
|
| 496 |
+
}}
|
| 497 |
+
/>
|
| 498 |
|
| 499 |
#### Does increased diversity help?
|
| 500 |
|
|
|
|
| 502 |
**Mixing rephrasing approaches**
|
| 503 |
We were wondering whether mixing the best performing rephrasing approaches can improve over the individual approaches. We find no significant increase over the best performing approach (mix-fw_edu_hq-math_1b_hq). It seems that when we mix together enough different prompts (mix-tutorial_1b_hq-faq_1b_hq-table_1b_hq-math_1b_hq), we don't necessarily need the source dataset (fw_edu_hq) for good performance. This could mean that when just training on one synthetic dataset we need the original dataset for diversity, but when we mix multiple ones it is not necessary. However, it does not hurt and is an easy way of increasing the dataset size while keeping the performance high. To follow up it would be interesting to study with how little synthetic data we can get away with without performance drops.
|
| 504 |
|
| 505 |
+
<HtmlEmbed
|
| 506 |
+
id="mixing-approaches"
|
| 507 |
+
src="d3-benchmark-comparison.html"
|
| 508 |
+
title="Mixing Rephrasing Approaches"
|
| 509 |
+
desc="Figure: Mixing multiple prompts vs individual prompts."
|
| 510 |
+
config={{
|
| 511 |
+
defaultView: "line",
|
| 512 |
+
datasetNames: {
|
| 513 |
+
"mix-fw_edu_hq-tutorial_1b_hq-fw_edu_hq-faq_1b_hq-table_1b_hq-math_1b_hq": "All Prompts + FW-Edu (HQ)",
|
| 514 |
+
"mix-fw_edu_hq-math_1b_hq": "Math",
|
| 515 |
+
"mix-tutorial_1b_hq-faq_1b_hq-table_1b_hq-math_1b_hq": "All Prompts (No Source)",
|
| 516 |
+
"mix-fw_edu_hq-table_1b_hq": "Table",
|
| 517 |
+
"mix-fw_edu_hq-faq_1b_hq": "FAQ",
|
| 518 |
+
"mix-fw_edu_hq-tutorial_1b_hq": "Tutorial",
|
| 519 |
+
dclm: "DCLM",
|
| 520 |
+
fw_edu_hq: "FineWeb-Edu (HQ)"
|
| 521 |
+
}
|
| 522 |
+
}}
|
| 523 |
+
/>
|
| 524 |
|
| 525 |
**Mixing model families**
|
| 526 |
We rephrased using different model families and saw SmolLM2 and Falcon3 clearly outperform Llama3.2 and Granite3. Now we wonder whether mixing the rephrased outputs of multiple models improves performance through increased diversity.
|
| 527 |
|
| 528 |
+
<HtmlEmbed
|
| 529 |
+
id="mixing-model-families"
|
| 530 |
+
src="d3-benchmark-comparison.html"
|
| 531 |
+
title="Mixing Model Families"
|
| 532 |
+
desc="Figure: Mixing rephrased outputs from different model families."
|
| 533 |
+
config={{
|
| 534 |
+
defaultView: "line",
|
| 535 |
+
datasetNames: {
|
| 536 |
+
"mix-fw_edu_hq-tutorial_smollm2_1.7b_hq": "SmolLM2",
|
| 537 |
+
"mix-fw_edu_hq-tutorial_smollm2_1.7b_hq-tutorial_falcon3_1b_hq": "SmolLM2 + Falcon3",
|
| 538 |
+
"mix-fw_edu_hq-tutorial_smollm2_1.7b_hq-tutorial_llama3.2_1b_hq": "SmolLM2 + Llama-3.2",
|
| 539 |
+
"mix-fw_edu_hq-tutorial_llama3.2_1b_hq-tutorial_granite3_1b_hq": "Llama-3.2 + Granite3",
|
| 540 |
+
"mix-fw_edu_hq-tutorial_llama3.2_1b_hq": "Llama-3.2",
|
| 541 |
+
dclm: "DCLM",
|
| 542 |
+
fw_edu_hq: "FineWeb-Edu (HQ)"
|
| 543 |
+
}
|
| 544 |
+
}}
|
| 545 |
+
/>
|
| 546 |
|
| 547 |
It turns out that benchmark performance does not improve through increased rephrasing model diversity but is largely an average of the mixed datasets performance (smollm2 and falcon3 are similar to just smollm2, smollm2 and llama3.2 lie in between smollm2 and llama3.2, llama3.2 and granite3 are similar to just llama3.2).
|
| 548 |
**Mixing both rephrasing approaches and model families**
|
| 549 |
Maybe we need more diversity by mixing both rephrasing approaches and model families?
|
| 550 |
|
| 551 |
+
<HtmlEmbed
|
| 552 |
+
id="mixing-both"
|
| 553 |
+
src="d3-benchmark-comparison.html"
|
| 554 |
+
title="Mixing Approaches and Model Families"
|
| 555 |
+
desc="Figure: Mixing both rephrasing approaches and model families."
|
| 556 |
+
config={{
|
| 557 |
+
defaultView: "line",
|
| 558 |
+
datasetNames: {
|
| 559 |
+
"mix-fw_edu_hq-faq_smollm2_1.7b_hq": "FAQ (SmolLM2)",
|
| 560 |
+
"mix-fw_edu_hq-faq_smollm2_1.7b_hq-tutorial_falcon3_1b_hq": "FAQ (SmolLM2) + Tutorial (Falcon3)",
|
| 561 |
+
"mix-fw_edu_hq-tutorial_smollm2_1.7b_hq": "Tutorial (SmolLM2)",
|
| 562 |
+
"mix-fw_edu_hq-tutorial_smollm2_1.7b_hq-tutorial_falcon3_1b_hq": "Tutorial (SmolLM2) + Tutorial (Falcon3)",
|
| 563 |
+
"mix-fw_edu_hq-tutorial_falcon3_1b_hq": "Tutorial (Falcon3)",
|
| 564 |
+
"mix-fw_edu_hq-faq_falcon3_1b_hq": "FAQ (Falcon3)",
|
| 565 |
+
dclm: "DCLM",
|
| 566 |
+
fw_edu_hq: "FineWeb-Edu (HQ)"
|
| 567 |
+
}
|
| 568 |
+
}}
|
| 569 |
+
/>
|
| 570 |
|
| 571 |
No, we get the same results as for just mixing rephrasing approaches or model families independently: the mix is around the average performance instead of resulting in a gain.
|
| 572 |
|
|
|
|
| 574 |
|
| 575 |
The original REWIRE prompt contains many typos and grammar errors. To what extent do typos in the prompt hurt performance?
|
| 576 |
|
| 577 |
+
<HtmlEmbed
|
| 578 |
+
id="typos-effect"
|
| 579 |
+
src="d3-benchmark-comparison.html"
|
| 580 |
+
title="Effect of Typos in Prompt"
|
| 581 |
+
desc="Figure: REWIRE prompt with original typos vs improved version at 1B and 12B scale."
|
| 582 |
+
config={{
|
| 583 |
+
defaultView: "line",
|
| 584 |
+
datasetNames: {
|
| 585 |
+
"mix-fw_edu_hq-guided_rewrite_original_12b_hq": "Original (12B)",
|
| 586 |
+
"mix-fw_edu_hq-guided_rewrite_improved_12b_hq": "Improved (12B)",
|
| 587 |
+
dclm: "DCLM",
|
| 588 |
+
"mix-fw_edu_hq-guided_rewrite_original_1b_hq": "Original (1B)",
|
| 589 |
+
"mix-fw_edu_hq-guided_rewrite_improved_1b_hq": "Improved (1B)",
|
| 590 |
+
fw_edu_hq: "FineWeb-Edu (HQ)"
|
| 591 |
+
}
|
| 592 |
+
}}
|
| 593 |
+
/>
|
| 594 |
|
| 595 |
Surprisingly, typos don't have a negative effect on downstream model performance. For the 1b model, even the opposite is the case.
|
| 596 |
|
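For context, the `HtmlEmbed.astro` component itself is not part of this commit. A minimal sketch of what a component with this prop shape might look like follows; everything here (the `/embeds/` path, the `data-config` handoff, the figure markup) is an assumption inferred from the usage above, not the actual implementation:

```astro
---
// Hypothetical sketch, NOT the real HtmlEmbed.astro.
// Props mirror the usage in experiments.mdx above.
interface Props {
  id: string;
  src: string;                       // name of the d3 HTML fragment to embed
  title?: string;
  desc?: string;
  config?: Record<string, unknown>;  // e.g. { defaultView, datasetNames }
}
const { id, src, title, desc, config = {} } = Astro.props;
---
<figure id={id} class="html-embed">
  {title && <p class="html-embed__title">{title}</p>}
  <!-- Assumed convention: fragments live under /embeds/ and read their
       options from the data-config attribute of the host iframe. -->
  <iframe
    src={`/embeds/${src}`}
    data-config={JSON.stringify(config)}
    loading="lazy"
    title={title ?? id}
  ></iframe>
  {desc && <figcaption>{desc}</figcaption>}
</figure>
```

The appeal of this pattern over the deleted PNGs is visible in the diff itself: each figure becomes a declarative mapping from run names (e.g. `mix-fw_edu_hq-tutorial_1b_hq`) to display labels, so datasets can be renamed or reordered without regenerating images.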