reuAC committed on
Commit
672259a
·
verified ·
1 Parent(s): 9bb37d1

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes. See raw diff
Files changed (50) hide show
  1. .gitattributes +14 -35
  2. .gitignore +11 -0
  3. LICENSE +21 -0
  4. README.md +172 -0
  5. README_CN.md +199 -0
  6. bench.py +182 -0
  7. check.py +122 -0
  8. config/base.py +41 -0
  9. config/bench_base.py +14 -0
  10. config/bench_gpt2.py +18 -0
  11. config/eval_gpt2.py +5 -0
  12. config/eval_gpt2_large.py +5 -0
  13. config/eval_gpt2_medium.py +5 -0
  14. config/eval_gpt2_xl.py +5 -0
  15. config/finetune_shakespeare.py +45 -0
  16. config/sample_base.py +12 -0
  17. config/sample_gpt2.py +17 -0
  18. config/sample_gpt2_new.py +17 -0
  19. config/sample_gpt2_new_nolr.py +17 -0
  20. config/sample_reflow_1.py +17 -0
  21. config/sample_reflow_1_big.py +17 -0
  22. config/sample_reflow_1_lite.py +17 -0
  23. config/sample_reflow_1_small.py +17 -0
  24. config/sample_reflow_1_small_sp.py +17 -0
  25. config/sample_reflow_1_topk.py +17 -0
  26. config/sample_reflow_1_topk_big.py +17 -0
  27. config/sample_sft_reflow_1.py +17 -0
  28. config/train_gpt2.py +45 -0
  29. config/train_gpt2_new.py +45 -0
  30. config/train_gpt2_new_nolr.py +45 -0
  31. config/train_gpt2_new_nolr_resume.py +45 -0
  32. config/train_gpt2_new_resume.py +45 -0
  33. config/train_gpt2_resume.py +45 -0
  34. config/train_reflow_1.py +46 -0
  35. config/train_reflow_1_big.py +46 -0
  36. config/train_reflow_1_big_resume.py +46 -0
  37. config/train_reflow_1_lite.py +49 -0
  38. config/train_reflow_1_lite_resume.py +49 -0
  39. config/train_reflow_1_resume.py +46 -0
  40. config/train_reflow_1_small.py +46 -0
  41. config/train_reflow_1_small_resume.py +46 -0
  42. config/train_reflow_1_small_sp.py +46 -0
  43. config/train_reflow_1_small_sp_resume.py +46 -0
  44. config/train_reflow_1_topk.py +48 -0
  45. config/train_reflow_1_topk_big.py +48 -0
  46. config/train_reflow_1_topk_big_resume.py +48 -0
  47. config/train_reflow_base.py +43 -0
  48. config/train_resume.py +44 -0
  49. config/train_sft.py +42 -0
  50. config/train_shakespeare_char.py +42 -0
.gitattributes CHANGED
@@ -1,35 +1,14 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
- *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
1
+ # Override jupyter in Github language stats for more accurate estimate of repo code languages
2
+ # reference: https://github.com/github/linguist/blob/master/docs/overrides.md#generated-code
3
+ *.ipynb linguist-generated
4
+ out/gpt2/ckpt.pt filter=lfs diff=lfs merge=lfs -text
5
+ out/gpt2-new/ckpt.pt filter=lfs diff=lfs merge=lfs -text
6
+ out/reflow-1/ckpt.pt filter=lfs diff=lfs merge=lfs -text
7
+ out/reflow-1-big/ckpt.pt filter=lfs diff=lfs merge=lfs -text
8
+ out/reflow-1-lite/ckpt.pt filter=lfs diff=lfs merge=lfs -text
9
+ out/reflow-1-small/ckpt.pt filter=lfs diff=lfs merge=lfs -text
10
+ out/reflow-1-small-sp/ckpt.pt filter=lfs diff=lfs merge=lfs -text
11
+ out/reflow-1-topk/ckpt.pt filter=lfs diff=lfs merge=lfs -text
12
+ out/reflow-1-topk-big/ckpt.pt filter=lfs diff=lfs merge=lfs -text
13
+ paper/paper-cn.pdf filter=lfs diff=lfs merge=lfs -text
14
+ paper/paper.pdf filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
.gitignore ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ .DS_Store
2
+ .idea
3
+ .ipynb_checkpoints/
4
+ .vscode
5
+ __pycache__/
6
+ *.bin
7
+ *.pkl
8
+ *.pyc
9
+ input.txt
10
+ env/
11
+ venv/
LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2022 Andrej Karpathy
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
README.md ADDED
@@ -0,0 +1,172 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: mit
3
+ language:
4
+ - en
5
+ - zh
6
+ tags:
7
+ - transformer
8
+ - interpretability
9
+ - mechanistic-interpretability
10
+ - language-model
11
+ - signal-decomposition
12
+ - sparse-representations
13
+ - pytorch
14
+ datasets:
15
+ - openwebtext
16
+ pipeline_tag: text-generation
17
+ ---
18
+
19
+ # reFlow
20
+
21
+ **A Metal Soul In My Hand** — A feature-decoupled Transformer architecture with native interpretability.
22
+
23
+ reFlow reconstructs the traditional full-rank embedding matrix into the product of a **Recipe Matrix** $W_{recipe} \in \mathbb{R}^{V \times S}$ and a **Signal Basis Matrix** $W_{basis} \in \mathbb{R}^{S \times d}$, forcing the model to maintain a set of continuous, low-redundancy signal bases in latent space. A dynamic vocabulary matrix $W_{vocab} = W_{recipe} \times W_{basis}$ is reconstructed in real-time at each forward pass, serving simultaneously as both the embedding matrix and the output projection matrix.
24
+
25
+ > **Paper**: [English (PDF)](./paper/paper.pdf) | [中文 (PDF)](./paper/paper-cn.pdf)
26
+
27
+ ## Project Structure
28
+
29
+ ```
30
+ reFlow/
31
+ ├── train.py # Training script (single GPU / DDP)
32
+ ├── sample.py # Text generation from trained models
33
+ ├── experiment.py # 12-experiment interpretability suite (Chinese)
34
+ ├── experiment_en.py # 12-experiment interpretability suite (English)
35
+ ├── check.py # Checkpoint parameter inspector
36
+ ├── bench.py # Performance benchmarking
37
+ ├── models/
38
+ │ ├── gpt2.py # Standard GPT-2 baseline
39
+ │ ├── gpt2-new.py # Modernized GPT-2 (RoPE + SwiGLU + RMSNorm)
40
+ │ ├── reflow.py # reFlow base architecture
41
+ │ ├── reflow-topk.py # reFlow with ReLU + Top-K hard sparsity
42
+ │ └── reflow-lite.py # reFlow with GQA + reduced MLP
43
+ ├── config/ # Training / sampling / eval configurations
44
+ ├── data/
45
+ │ ├── openwebtext/ # OpenWebText dataset preparation
46
+ │ └── sft-lima/ # LIMA SFT dataset preparation
47
+ └── out/ # Checkpoints and experiment reports
48
+ ```
49
+
50
+ ## Installation
51
+
52
+ ### Prerequisites
53
+
54
+ - Python 3.10+
55
+ - CUDA-compatible GPU (tested on Tesla T4 x4)
56
+
57
+ ### 1. PyTorch (CUDA 12.8)
58
+
59
+ ```bash
60
+ pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu128
61
+ ```
62
+
63
+ > Adjust the CUDA version in the URL to match your driver. See [PyTorch Get Started](https://pytorch.org/get-started/locally/).
64
+
65
+ ### 2. Core Dependencies
66
+
67
+ ```bash
68
+ pip install datasets tiktoken wandb tqdm
69
+ ```
70
+
71
+ ### 3. Experiment Suite Dependencies
72
+
73
+ The interpretability experiments (`experiment.py`) require additional packages:
74
+
75
+ ```bash
76
+ pip install numpy matplotlib seaborn scikit-learn scipy adjustText
77
+ ```
78
+
79
+ ### Quick Install (All-in-One)
80
+
81
+ ```bash
82
+ pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu128
83
+ pip install datasets tiktoken wandb tqdm numpy matplotlib seaborn scikit-learn scipy adjustText
84
+ ```
85
+
86
+ ## Data Preparation
87
+
88
+ ### OpenWebText
89
+
90
+ ```bash
91
+ python data/openwebtext/prepare.py
92
+ ```
93
+
94
+ This downloads the OpenWebText corpus (~54 GB) and tokenizes it with the GPT-2 BPE tokenizer. Output: `data/openwebtext/train.bin` (~17 GB, ~9B tokens) and `val.bin`.
95
+
96
+ ## Training
97
+
98
+ All configurations are in `config/`. No CLI overrides — all hyperparameters must be set in the config file.
99
+
100
+ ### Single GPU
101
+
102
+ ```bash
103
+ python train.py config/train_reflow_1.py
104
+ ```
105
+
106
+ ### Multi-GPU (DDP)
107
+
108
+ ```bash
109
+ torchrun --standalone --nproc_per_node=4 train.py config/train_reflow_1.py
110
+ ```
111
+
112
+ ### Available Training Configs
113
+
114
+ | Config | Architecture | Layers | Params | Notes |
115
+ |--------|-------------|--------|--------|-------|
116
+ | `train_gpt2.py` | GPT-2 | 36 | 505.62M | Standard baseline |
117
+ | `train_gpt2_new.py` | GPT-2-New | 36 | 514.01M | + RoPE, SwiGLU, RMSNorm |
118
+ | `train_reflow_1.py` | reFlow | 32 | 463.67M | Base reFlow, constant lr |
119
+ | `train_reflow_1_big.py` | reFlow | 36 | 515.06M | lr decay, for interpretability |
120
+ | `train_reflow_1_topk_big.py` | reFlow-TopK | 36 | 515.06M | + ReLU + Top-64 sparsity |
121
+ | `train_reflow_1_lite.py` | reFlow-Lite | 32 | 413.34M | + GQA, reduced MLP |
122
+ | `train_reflow_1_small.py` | reFlow | 6 | 46.47M | Small-scale validation |
123
+
124
+ ### Resume Training
125
+
126
+ Append `_resume` to the config name (e.g., `train_reflow_1_big_resume.py`).
127
+
128
+ ## Text Generation
129
+
130
+ ```bash
131
+ python sample.py config/sample_reflow_1.py
132
+ ```
133
+
134
+ Edit the config file to change the prompt, temperature, top-k, etc.
135
+
136
+ ## Interpretability Experiments
137
+
138
+ The experiment suite runs 12 analyses on a trained reFlow model. Both Chinese and English versions are available:
139
+
140
+ ```bash
141
+ python experiment_en.py config/train_reflow_1_big.py # English
142
+ python experiment.py config/train_reflow_1_big.py # Chinese
143
+ ```
144
+
145
+ An interactive menu will appear:
146
+
147
+ | # | Experiment | Group |
148
+ |---|-----------|-------|
149
+ | 1 | Recipe Atlas — recipe-space nearest neighbors | A. Signal Identity |
150
+ | 2 | Sparsity Profile — activation sparsity analysis | A. Signal Identity |
151
+ | 3 | Basis Geometry — singular value & effective rank | A. Signal Identity |
152
+ | 4 | Semantic Galaxy — PCA clustering visualization | B. Semantic Properties |
153
+ | 5 | Semantic Algebra — vector arithmetic (king − man + woman = queen) | B. Semantic Properties |
154
+ | 6 | Typo Resilience — robustness to spelling errors | B. Semantic Properties |
155
+ | 7 | Layer Evolution — per-layer probability crystallization | C. Mechanistic Analysis |
156
+ | 8 | Signal Flow — signal activation heatmaps across layers | C. Mechanistic Analysis |
157
+ | 9 | Causal Ablation — progressive signal knockout curves | C. Mechanistic Analysis |
158
+ | 10 | Emotion Surgery — sentiment steering via signal injection | D. Control & Steering |
159
+ | 11 | Concept Inception — binary-search concept implantation | D. Control & Steering |
160
+ | 12 | Genetic Hijack — global recipe matrix manipulation | D. Control & Steering |
161
+
162
+ Enter `all` to run all experiments, or specific numbers (e.g., `1 3 5`). Reports are saved to `out/<model>/audit_reports/`.
163
+
164
+ ## Checkpoint Inspection
165
+
166
+ ```bash
167
+ python check.py config/train_reflow_1.py out/reflow-1/ckpt.pt
168
+ ```
169
+
170
+ ## License
171
+
172
+ MIT License. Based on [nanoGPT](https://github.com/karpathy/nanoGPT) by Andrej Karpathy.
README_CN.md ADDED
@@ -0,0 +1,199 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # reFlow
2
+
3
+ **A Metal Soul In My Hand** — 具备原生可解释性的特征解耦 Transformer 架构。
4
+
5
+ reFlow 将传统全秩词嵌入矩阵重构为**配方矩阵** $W_{recipe} \in \mathbb{R}^{V \times S}$ 与**信号基底矩阵** $W_{basis} \in \mathbb{R}^{S \times d}$ 的乘积形式,迫使模型在潜空间中维护一组连续、低冗余的信号基底。动态词表矩阵 $W_{vocab} = W_{recipe} \times W_{basis}$ 在每次前向传播中实时重构,同时作为嵌入矩阵与输出投影矩阵使用。
6
+
7
+ > **论文**: [English (PDF)](./paper/paper.pdf) | [中文 (PDF)](./paper/paper-cn.pdf)
8
+
9
+ ## 项目结构
10
+
11
+ ```
12
+ reFlow/
13
+ ├── train.py # 训练脚本(单卡 / DDP 多卡)
14
+ ├── sample.py # 从训练好的模型生成文本
15
+ ├── experiment.py # 12 项可解释性实验套件(中文)
16
+ ├── experiment_en.py # 12 项可解释性实验套件(English)
17
+ ├── check.py # Checkpoint 参数检查工具
18
+ ├── bench.py # 性能基准测试
19
+ ├── models/
20
+ │ ├── gpt2.py # 标准 GPT-2 基线
21
+ │ ├── gpt2-new.py # 现代化 GPT-2(RoPE + SwiGLU + RMSNorm)
22
+ │ ├── reflow.py # reFlow 基础架构
23
+ │ ├── reflow-topk.py # reFlow + ReLU + Top-K 硬稀疏变体
24
+ │ └── reflow-lite.py # reFlow + GQA + 缩减 MLP 轻量变体
25
+ ├── config/ # 训练 / 采样 / 评估配置文件
26
+ ├── data/
27
+ │ ├── openwebtext/ # OpenWebText 数据集预处理
28
+ │ └── sft-lima/ # LIMA SFT 数据集预处理
29
+ └── out/ # Checkpoints 与实验报告输出
30
+ ```
31
+
32
+ ## 安装
33
+
34
+ ### 环境要求
35
+
36
+ - Python 3.10+
37
+ - 支持 CUDA 的 GPU(实验环境:Tesla T4 x4)
38
+
39
+ ### 1. 安装 PyTorch(CUDA 12.8)
40
+
41
+ ```bash
42
+ pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu128
43
+ ```
44
+
45
+ > 请根据你的 CUDA 驱动版本调整 URL 中的版本号。详见 [PyTorch 官方安装指南](https://pytorch.org/get-started/locally/)。
46
+
47
+ ### 2. 核心依赖
48
+
49
+ ```bash
50
+ pip install datasets tiktoken wandb tqdm
51
+ ```
52
+
53
+ ### 3. 可解释性实验依赖
54
+
55
+ 运行 `experiment.py` 需要额外安装以下包:
56
+
57
+ ```bash
58
+ pip install numpy matplotlib seaborn scikit-learn scipy adjustText
59
+ ```
60
+
61
+ ### 一键安装(全部依赖)
62
+
63
+ ```bash
64
+ pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu128
65
+ pip install datasets tiktoken wandb tqdm numpy matplotlib seaborn scikit-learn scipy adjustText
66
+ ```
67
+
68
+ ## 数据准备
69
+
70
+ ### OpenWebText
71
+
72
+ ```bash
73
+ python data/openwebtext/prepare.py
74
+ ```
75
+
76
+ 该脚本会下载 OpenWebText 语料库(约 54 GB),使用 GPT-2 BPE 分词器进行编码。输出:`data/openwebtext/train.bin`(约 17 GB,约 90 亿 tokens)和 `val.bin`。
77
+
78
+ ## 训练
79
+
80
+ 所有超参数均在 `config/` 目录下的配置文件中指定,不支持命令行覆盖。
81
+
82
+ ### 单卡训练
83
+
84
+ ```bash
85
+ python train.py config/train_reflow_1.py
86
+ ```
87
+
88
+ ### 多卡训练(DDP)
89
+
90
+ ```bash
91
+ torchrun --standalone --nproc_per_node=4 train.py config/train_reflow_1.py
92
+ ```
93
+
94
+ ### 可用训练配置
95
+
96
+ | 配置文件 | 架构 | 层数 | 参数量 | 说明 |
97
+ |---------|------|------|--------|------|
98
+ | `train_gpt2.py` | GPT-2 | 36 | 505.62M | 标准基线 |
99
+ | `train_gpt2_new.py` | GPT-2-New | 36 | 514.01M | + RoPE、SwiGLU、RMSNorm |
100
+ | `train_reflow_1.py` | reFlow | 32 | 463.67M | 基础 reFlow,恒定学习率 |
101
+ | `train_reflow_1_big.py` | reFlow | 36 | 515.06M | 学习率衰减,用于可解释性实验 |
102
+ | `train_reflow_1_topk_big.py` | reFlow-TopK | 36 | 515.06M | + ReLU + Top-64 稀疏化 |
103
+ | `train_reflow_1_lite.py` | reFlow-Lite | 32 | 413.34M | + GQA,缩减 MLP |
104
+ | `train_reflow_1_small.py` | reFlow | 6 | 46.47M | 小规模验证 |
105
+
106
+ ### 断点续训
107
+
108
+ 使用对应的 `_resume` 配置文件(如 `train_reflow_1_big_resume.py`)。
109
+
110
+ ## 文本生成
111
+
112
+ ```bash
113
+ python sample.py config/sample_reflow_1.py
114
+ ```
115
+
116
+ 在配置文件中修改 prompt、temperature、top_k 等参数。
117
+
118
+ ## 可解释性实验
119
+
120
+ 实验套件对训练好的 reFlow 模型执行 12 项分析,提供中英文两个版本:
121
+
122
+ ```bash
123
+ python experiment.py config/train_reflow_1_big.py # 中文版
124
+ python experiment_en.py config/train_reflow_1_big.py # English
125
+ ```
126
+
127
+ 运行后将出现交互式菜单:
128
+
129
+ | 编号 | 实验名称 | 分组 |
130
+ |------|---------|------|
131
+ | 1 | 配方空间图谱 — 配方近邻与聚类热力图 | A. 信号本体 |
132
+ | 2 | 信号稀疏性分析 — 激活稀疏率统计 | A. 信号本体 |
133
+ | 3 | 信号基底几何 — 奇异值分解与有效秩 | A. 信号本体 |
134
+ | 4 | 语义星空图 — PCA 聚类可视化 | B. 语义性质 |
135
+ | 5 | 语义代数运算 — 向量算术(king − man + woman = queen) | B. 语义性质 |
136
+ | 6 | 拼写鲁棒性 — 对拼写错误的容忍度 | B. 语义性质 |
137
+ | 7 | 层级概率演化 — 逐层预测概率结晶过程 | C. 机械分析 |
138
+ | 8 | 信号流追踪 — 信号激活热力图 | C. 机械分析 |
139
+ | 9 | 因果消融曲线 — 逐信号消融概率变化 | C. 机械分析 |
140
+ | 10 | 情绪手术 — 信号注入实现情感翻转 | D. 操控验证 |
141
+ | 11 | 概念注入 — 二分搜索概念植入 | D. 操控验证 |
142
+ | 12 | 基因库篡改 — 全局配方矩阵操控 | D. 操控验证 |
143
+
144
+ 输入 `all` 运行全部实验,或输入编号(如 `1 3 5`)选择性运行。实验报告保存至 `out/<模型名>/audit_reports/`。
145
+
146
+ ## Checkpoint 检查
147
+
148
+ ```bash
149
+ python check.py config/train_reflow_1.py out/reflow-1/ckpt.pt
150
+ ```
151
+
152
+ ## 复现论文实验
153
+
154
+ ### 第 4 章:收敛实验
155
+
156
+ 依次训练各模型变体并对比训练曲线:
157
+
158
+ ```bash
159
+ # GPT-2 基线
160
+ torchrun --standalone --nproc_per_node=4 train.py config/train_gpt2.py
161
+
162
+ # GPT-2-New(现代化组件)
163
+ torchrun --standalone --nproc_per_node=4 train.py config/train_gpt2_new.py
164
+
165
+ # reFlow 基础版
166
+ torchrun --standalone --nproc_per_node=4 train.py config/train_reflow_1.py
167
+
168
+ # reFlow-Big(用于第 5-6 章实验)
169
+ torchrun --standalone --nproc_per_node=4 train.py config/train_reflow_1_big.py
170
+
171
+ # reFlow-TopK-Big(硬稀疏变体)
172
+ torchrun --standalone --nproc_per_node=4 train.py config/train_reflow_1_topk_big.py
173
+
174
+ # reFlow-Lite(轻量版)
175
+ torchrun --standalone --nproc_per_node=4 train.py config/train_reflow_1_lite.py
176
+
177
+ # reFlow-Small(小规模验证)
178
+ torchrun --standalone --nproc_per_node=4 train.py config/train_reflow_1_small.py
179
+ ```
180
+
181
+ ### 第 5 章:可解释性实验(reFlow-1-Big)
182
+
183
+ ```bash
184
+ python experiment.py config/train_reflow_1_big.py
185
+ # 输入 all 运行全部 12 项实验
186
+ ```
187
+
188
+ ### 第 6 章:硬稀疏对比实验(reFlow-1-TopK-Big)
189
+
190
+ ```bash
191
+ python experiment.py config/train_reflow_1_topk_big.py
192
+ # 输入 all 运行全部 12 项实验
193
+ ```
194
+
195
+ 对比两组实验报告即可复现论文第 6 章的对比表格。
196
+
197
+ ## 许可证
198
+
199
+ MIT License。基于 [nanoGPT](https://github.com/karpathy/nanoGPT)(Andrej Karpathy)二次开发。
bench.py ADDED
@@ -0,0 +1,182 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Benchmark script for model performance testing
3
+
4
+ REQUIRED:
5
+ 1. You must specify a config file from the config/ directory
6
+ 2. All configuration must be in the config file. No CLI overrides allowed
7
+
8
+ Usage:
9
+ python bench.py <config_file>
10
+
11
+ Example:
12
+ python bench.py config/bench_gpt2.py
13
+ """
14
+ import sys
15
+ import os
16
+
17
+ # -----------------------------------------------------------------------------
18
+ # Configuration loading (BEFORE imports to validate config first)
19
+ # -----------------------------------------------------------------------------
20
+ if len(sys.argv) != 2:
21
+ print("ERROR: Invalid arguments!")
22
+ print("Usage: python bench.py <config_file>")
23
+ print("Available configs in config/:")
24
+ print(" - bench_gpt2.py")
25
+ sys.exit(1)
26
+
27
+ config_file = sys.argv[1]
28
+
29
+ # Disallow --key=value arguments
30
+ for arg in sys.argv[1:]:
31
+ if arg.startswith('--'):
32
+ print(f"ERROR: CLI overrides are not supported. All config must be in file: {config_file}")
33
+ sys.exit(1)
34
+
35
+ # Load config
36
+ print(f"Loading config from: {config_file}")
37
+ exec(open(config_file).read())
38
+
39
+ # Validate required config keys
40
+ required_keys = ['model_config']
41
+ missing_keys = [k for k in required_keys if k not in globals()]
42
+ if missing_keys:
43
+ print(f"ERROR: Missing required config keys: {missing_keys}")
44
+ sys.exit(1)
45
+
46
+ # Load model configuration
47
+ model_config = globals()['model_config']
48
+ model_file = f"models/{model_config}.py"
49
+ try:
50
+ exec(open(model_file).read())
51
+ except FileNotFoundError:
52
+ print(f"ERROR: Model file not found: {model_file}")
53
+ sys.exit(1)
54
+
55
+ # Get model-specific required config keys from GPTConfig
56
+ model_required_keys = []
57
+ if 'GPTConfig' in globals():
58
+ config_class = globals()['GPTConfig']
59
+ import dataclasses
60
+ for field in dataclasses.fields(config_class):
61
+ model_required_keys.append(field.name)
62
+
63
+ # Validate model-specific config keys
64
+ if init_from == 'scratch':
65
+ missing_model_keys = [k for k in model_required_keys if k not in globals()]
66
+ if missing_model_keys:
67
+ print(f"ERROR: Missing required model config keys for {model_config}: {missing_model_keys}")
68
+ sys.exit(1)
69
+
70
+ # Print configuration
71
+ print("\n" + "=" * 60)
72
+ print("BENCH CONFIGURATION")
73
+ print("=" * 60)
74
+ for key in sorted(globals().keys()):
75
+ val = globals().get(key)
76
+ if isinstance(val, (int, float, bool, str)) and not key.startswith('_'):
77
+ print(f" {key:30s} = {val}")
78
+ print("=" * 60 + "\n")
79
+
80
+ # Now import dependencies
81
+ import os
82
+ from contextlib import nullcontext
83
+ import numpy as np
84
+ import time
85
+ import torch
86
+
87
+ # Import GPTConfig and GPT
88
+ GPTConfig = globals()['GPTConfig']
89
+ GPT = globals()['GPT']
90
+
91
+ # Auto-detect dtype
92
+ if dtype == 'bfloat16' and not (torch.cuda.is_available() and torch.cuda.is_bf16_supported()):
93
+ dtype = 'float16'
94
+
95
+ torch.manual_seed(seed)
96
+ torch.cuda.manual_seed(seed)
97
+ torch.backends.cuda.matmul.allow_tf32 = True
98
+ torch.backends.cudnn.allow_tf32 = True
99
+ device_type = 'cuda' if 'cuda' in device else 'cpu'
100
+ ptdtype = {'float32': torch.float32, 'bfloat16': torch.bfloat16, 'float16': torch.float16}[dtype]
101
+ ctx = nullcontext() if device_type == 'cpu' else torch.amp.autocast(device_type=device_type, dtype=ptdtype)
102
+
103
+ # data loading
104
+ if real_data:
105
+ dataset = globals().get('dataset', 'openwebtext')
106
+ data_dir = os.path.join('data', dataset)
107
+ train_data = np.memmap(os.path.join(data_dir, 'train.bin'), dtype=np.uint16, mode='r')
108
+ def get_batch(split):
109
+ data = train_data
110
+ ix = torch.randint(len(data) - block_size, (batch_size,))
111
+ x = torch.stack([torch.from_numpy((data[i:i+block_size]).astype(np.int64)) for i in ix])
112
+ y = torch.stack([torch.from_numpy((data[i+1:i+1+block_size]).astype(np.int64)) for i in ix])
113
+ x, y = x.pin_memory().to(device, non_blocking=True), y.pin_memory().to(device, non_blocking=True)
114
+ return x, y
115
+ else:
116
+ x = torch.randint(50304, (batch_size, block_size), device=device)
117
+ y = torch.randint(50304, (batch_size, block_size), device=device)
118
+ get_batch = lambda split: (x, y)
119
+
120
+ # model init
121
+ gptconf = GPTConfig(
122
+ block_size=block_size,
123
+ n_layer=n_layer,
124
+ n_head=n_head,
125
+ n_embd=n_embd,
126
+ dropout=0,
127
+ bias=bias,
128
+ )
129
+ model = GPT(gptconf)
130
+ model.to(device)
131
+
132
+ optimizer = model.configure_optimizers(weight_decay=1e-2, learning_rate=1e-4, betas=(0.9, 0.95), device_type=device_type)
133
+
134
+ if compile:
135
+ print("Compiling model...")
136
+ model = torch.compile(model)
137
+
138
+ if profile:
139
+ wait, warmup, active = 5, 5, 5
140
+ num_steps = wait + warmup + active
141
+ with torch.profiler.profile(
142
+ activities=[torch.profiler.ProfilerActivity.CPU, torch.profiler.ProfilerActivity.CUDA],
143
+ schedule=torch.profiler.schedule(wait=wait, warmup=warmup, active=active, repeat=1),
144
+ on_trace_ready=torch.profiler.tensorboard_trace_handler('./bench_log'),
145
+ record_shapes=False,
146
+ profile_memory=False,
147
+ with_stack=False,
148
+ with_flops=True,
149
+ with_modules=False,
150
+ ) as prof:
151
+ X, Y = get_batch('train')
152
+ for k in range(num_steps):
153
+ with ctx:
154
+ logits, loss = model(X, Y)
155
+ X, Y = get_batch('train')
156
+ optimizer.zero_grad(set_to_none=True)
157
+ loss.backward()
158
+ optimizer.step()
159
+ lossf = loss.item()
160
+ print(f"{k}/{num_steps} loss: {lossf:.4f}")
161
+ prof.step()
162
+ else:
163
+ # simple benchmarking
164
+ torch.cuda.synchronize()
165
+ for stage, num_steps in enumerate([10, 20]):
166
+ t0 = time.time()
167
+ X, Y = get_batch('train')
168
+ for k in range(num_steps):
169
+ with ctx:
170
+ logits, loss = model(X, Y)
171
+ X, Y = get_batch('train')
172
+ optimizer.zero_grad(set_to_none=True)
173
+ loss.backward()
174
+ optimizer.step()
175
+ lossf = loss.item()
176
+ print(f"{k}/{num_steps} loss: {lossf:.4f}")
177
+ torch.cuda.synchronize()
178
+ t1 = time.time()
179
+ dt = t1 - t0
180
+ mfu = model.estimate_mfu(batch_size * 1 * num_steps, dt)
181
+ if stage == 1:
182
+ print(f"time per iteration: {dt/num_steps*1000:.4f}ms, MFU: {mfu*100:.2f}%")
check.py ADDED
@@ -0,0 +1,122 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Check model parameters from a checkpoint
3
+
4
+ Usage:
5
+ python check.py <config_file> <checkpoint_file>
6
+
7
+ Example:
8
+ python check.py config/train_reflow_web.py out-web/ckpt.pt
9
+ """
10
+ import sys
11
+ import os
12
+ import torch
13
+
14
+ # -----------------------------------------------------------------------------
15
+ # Configuration loading
16
+ # -----------------------------------------------------------------------------
17
+ if len(sys.argv) != 3:
18
+ print("ERROR: Invalid arguments!")
19
+ print("Usage: python check.py <config_file> <checkpoint_file>")
20
+ print("Example: python check.py config/train_reflow_web.py out-web/ckpt.pt")
21
+ sys.exit(1)
22
+
23
+ config_file = sys.argv[1]
24
+ checkpoint_file = sys.argv[2]
25
+
26
+ if not os.path.exists(config_file):
27
+ print(f"ERROR: Config file not found: {config_file}")
28
+ sys.exit(1)
29
+
30
+ if not os.path.exists(checkpoint_file):
31
+ print(f"ERROR: Checkpoint file not found: {checkpoint_file}")
32
+ sys.exit(1)
33
+
34
+ # Load config
35
+ print(f"Loading config from: {config_file}")
36
+ exec(open(config_file).read())
37
+
38
+ # Load model configuration
39
+ model_config = globals().get('model_config')
40
+ if not model_config:
41
+ print("ERROR: 'model_config' is required in config file")
42
+ sys.exit(1)
43
+
44
+ model_file = f"models/{model_config}.py"
45
+ try:
46
+ exec(open(model_file).read())
47
+ except FileNotFoundError:
48
+ print(f"ERROR: Model file not found: {model_file}")
49
+ sys.exit(1)
50
+
51
+ # Import GPTConfig and GPT
52
+ GPTConfig = globals()['GPTConfig']
53
+ GPT = globals()['GPT']
54
+
55
+ # Load checkpoint
56
+ print(f"Loading checkpoint from: {checkpoint_file}")
57
+ checkpoint = torch.load(checkpoint_file, map_location='cpu')
58
+ model_args = checkpoint['model_args']
59
+
60
+ # Create model and load weights
61
+ gptconf = GPTConfig(**model_args)
62
+ model = GPT(gptconf)
63
+ state_dict = checkpoint['model']
64
+
65
+ # Handle PyTorch 2.0+ compiled model keys
66
+ unwanted_prefix = '_orig_mod.'
67
+ for k in list(state_dict.keys()):
68
+ if k.startswith(unwanted_prefix):
69
+ state_dict[k[len(unwanted_prefix):]] = state_dict.pop(k)
70
+
71
+ model.load_state_dict(state_dict)
72
+
73
+ # Print model information
74
+ print("\n" + "=" * 60)
75
+ print("MODEL INFORMATION")
76
+ print("=" * 60)
77
+
78
+ print(f"\nModel Architecture: {model_config}")
79
+ print(f"Checkpoint: {checkpoint_file}")
80
+
81
+ print(f"\nModel Arguments:")
82
+ for k, v in model_args.items():
83
+ print(f" {k:20s} = {v}")
84
+
85
+ print(f"\nTotal Parameters: {model.get_num_params()/1e6:.2f}M")
86
+
87
+ # Count parameters by component
88
+ if hasattr(model, 'transformer'):
89
+ print("\nParameters by component:")
90
+
91
+ # Show wte (embedding) - for Reflow includes vocab_to_signals + signal_basis
92
+ if hasattr(model.transformer, 'wte'):
93
+ wte = model.transformer.wte
94
+ if hasattr(wte, 'vocab_to_signals'):
95
+ vocab_to_signals_params = wte.vocab_to_signals.weight.numel()
96
+ print(f" transformer.wte.vocab_to_signals: {vocab_to_signals_params/1e6:>10.2f}M")
97
+ if hasattr(wte, 'signal_basis'):
98
+ signal_basis_params = wte.signal_basis.numel()
99
+ print(f" transformer.wte.signal_basis: {signal_basis_params/1e6:>10.2f}M")
100
+ wte_params = sum(p.numel() for p in wte.parameters())
101
+ print(f" transformer.wte (total): {wte_params/1e6:>10.2f}M")
102
+
103
+ # Count transformer.h (layers)
104
+ if hasattr(model.transformer, 'h'):
105
+ h_params = sum(p.numel() for p in model.transformer.h.parameters())
106
+ print(f" transformer.h (all layers): {h_params/1e6:>10.2f}M")
107
+
108
+ # Show ln_f
109
+ if hasattr(model.transformer, 'ln_f'):
110
+ ln_f_params = sum(p.numel() for p in model.transformer.ln_f.parameters())
111
+ print(f" transformer.ln_f: {ln_f_params/1e6:>10.2f}M")
112
+
113
+ print(f"\nTotal trainable parameters: {sum(p.numel() for p in model.parameters() if p.requires_grad)/1e6:.2f}M")
114
+ print(f"Total non-trainable parameters: {sum(p.numel() for p in model.parameters() if not p.requires_grad)/1e6:.2f}M")
115
+
116
+ # Training info if available
117
+ if 'iter_num' in checkpoint:
118
+ print(f"\nTraining Info:")
119
+ print(f" iter_num: {checkpoint['iter_num']}")
120
+ print(f" best_val_loss: {checkpoint.get('best_val_loss', 'N/A')}")
121
+
122
+ print("=" * 60)
config/base.py ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ out_dir = 'out'
3
+ eval_interval = 2000
4
+ log_interval = 1
5
+ eval_iters = 200
6
+ eval_only = False
7
+ always_save_checkpoint = True
8
+ init_from = 'scratch'
9
+
10
+ wandb_log = False
11
+ wandb_project = 'owt'
12
+ wandb_run_name = 'gpt2'
13
+
14
+ dataset = 'openwebtext'
15
+ gradient_accumulation_steps = 5 * 8
16
+ batch_size = 12
17
+ block_size = 1024
18
+
19
+ n_layer = 12
20
+ n_head = 12
21
+ n_embd = 768
22
+ dropout = 0.0
23
+ bias = False
24
+
25
+ learning_rate = 6e-4
26
+ max_iters = 600000
27
+ weight_decay = 1e-1
28
+ beta1 = 0.9
29
+ beta2 = 0.95
30
+ grad_clip = 1.0
31
+
32
+ decay_lr = True
33
+ warmup_iters = 2000
34
+ lr_decay_iters = 600000
35
+ min_lr = 6e-5
36
+
37
+ backend = 'nccl'
38
+
39
+ device = 'cuda'
40
+ dtype = 'bfloat16'
41
+ compile = True
config/bench_base.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ batch_size = 12
3
+ block_size = 1024
4
+ bias = False
5
+ real_data = True
6
+ seed = 1337
7
+ device = 'cuda'
8
+ dtype = 'bfloat16'
9
+ compile = True
10
+ profile = False
11
+
12
+ n_layer = 12
13
+ n_head = 12
14
+ n_embd = 768
config/bench_gpt2.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ model_config = 'gpt2'
3
+
4
+ batch_size = 12
5
+ block_size = 1024
6
+ bias = False
7
+ real_data = True
8
+ seed = 1337
9
+ device = 'cuda'
10
+ dtype = 'bfloat16'
11
+ compile = True
12
+ profile = False
13
+
14
+ n_layer = 12
15
+ n_head = 12
16
+ n_embd = 768
17
+
18
+ init_from = 'scratch'
config/eval_gpt2.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ batch_size = 8
2
+ eval_iters = 500
3
+ eval_only = True
4
+ wandb_log = False
5
+ init_from = 'gpt2'
config/eval_gpt2_large.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ batch_size = 8
2
+ eval_iters = 500
3
+ eval_only = True
4
+ wandb_log = False
5
+ init_from = 'gpt2-large'
config/eval_gpt2_medium.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ batch_size = 8
2
+ eval_iters = 500
3
+ eval_only = True
4
+ wandb_log = False
5
+ init_from = 'gpt2-medium'
config/eval_gpt2_xl.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ batch_size = 8
2
+ eval_iters = 500
3
+ eval_only = True
4
+ wandb_log = False
5
+ init_from = 'gpt2-xl'
config/finetune_shakespeare.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import time
3
+
4
+ model_config = 'gpt2'
5
+
6
+ out_dir = 'out-shakespeare'
7
+ eval_interval = 5
8
+ log_interval = 1
9
+ eval_iters = 40
10
+ eval_only = False
11
+ always_save_checkpoint = False
12
+ init_from = 'gpt2-xl'
13
+
14
+ wandb_log = False
15
+ wandb_project = 'shakespeare'
16
+ wandb_run_name = 'ft-' + str(time.time())
17
+
18
+ dataset = 'shakespeare'
19
+ gradient_accumulation_steps = 32
20
+ batch_size = 1
21
+ block_size = 1024
22
+
23
+ n_layer = None
24
+ n_head = None
25
+ n_embd = None
26
+ dropout = 0.1
27
+ bias = False
28
+
29
+ learning_rate = 3e-5
30
+ max_iters = 20
31
+ weight_decay = 1e-1
32
+ beta1 = 0.9
33
+ beta2 = 0.95
34
+ grad_clip = 1.0
35
+
36
+ decay_lr = False
37
+ warmup_iters = 0
38
+ lr_decay_iters = 20
39
+ min_lr = 3e-5
40
+
41
+ backend = 'nccl'
42
+
43
+ device = 'cuda'
44
+ dtype = 'bfloat16'
45
+ compile = True
config/sample_base.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ init_from = 'resume'
3
+ out_dir = 'out'
4
+ start = "\n"
5
+ num_samples = 10
6
+ max_new_tokens = 500
7
+ temperature = 0.8
8
+ top_k = 200
9
+ seed = 1337
10
+ device = 'cuda'
11
+ dtype = 'bfloat16'
12
+ compile = False
config/sample_gpt2.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ model_config = 'gpt2'
3
+
4
+ out_dir = 'out/gpt2'
5
+ init_from = 'resume'
6
+
7
+ start = "The sun is bright, the night is dark, fire is hot, and ice is"
8
+ num_samples = 10
9
+ max_new_tokens = 500
10
+ temperature = 0.8
11
+ top_k = 200
12
+
13
+ seed = 1337
14
+
15
+ device = 'cuda'
16
+ dtype = 'bfloat16'
17
+ compile = False
config/sample_gpt2_new.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ model_config = 'gpt2-new'
3
+
4
+ out_dir = 'out/gpt2-new'
5
+ init_from = 'resume'
6
+
7
+ start = "Q: What is the capital of China?\nA: Beijing\nQ: What is the capital of Japan?\nA: Tokyo\nQ: What is the capital of France?\nA:"
8
+ num_samples = 10
9
+ max_new_tokens = 50
10
+ temperature = 0.01
11
+ top_k = 20
12
+
13
+ seed = 1337
14
+
15
+ device = 'cuda'
16
+ dtype = 'float16'
17
+ compile = False
config/sample_gpt2_new_nolr.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ model_config = 'gpt2-new'
3
+
4
+ out_dir = 'out/gpt2-new-nolr'
5
+ init_from = 'resume'
6
+
7
+ start = "The sun is bright, the night is dark, fire is hot, and ice is"
8
+ num_samples = 10
9
+ max_new_tokens = 500
10
+ temperature = 0.8
11
+ top_k = 200
12
+
13
+ seed = 1337
14
+
15
+ device = 'cuda'
16
+ dtype = 'bfloat16'
17
+ compile = False
config/sample_reflow_1.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ model_config = 'reflow'
3
+
4
+ out_dir = 'out/reflow-1'
5
+ init_from = 'resume'
6
+
7
+ start = "Q: What is the capital of China?\nA: Beijing\nQ: What is the capital of Japan?\nA: Tokyo\nQ: What is the capital of France?\nA:"
8
+ num_samples = 10
9
+ max_new_tokens = 50
10
+ temperature = 0.01
11
+ top_k = 20
12
+
13
+ seed = 1337
14
+
15
+ device = 'cuda'
16
+ dtype = 'float16'
17
+ compile = False
config/sample_reflow_1_big.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ model_config = 'reflow'
3
+
4
+ out_dir = 'out/reflow-1-big'
5
+ init_from = 'resume'
6
+
7
+ start = "The sun is bright, the night is dark, fire is hot, and ice is"
8
+ num_samples = 10
9
+ max_new_tokens = 500
10
+ temperature = 0.8
11
+ top_k = 200
12
+
13
+ seed = 1337
14
+
15
+ device = 'cuda'
16
+ dtype = 'float16'
17
+ compile = False
config/sample_reflow_1_lite.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ model_config = 'reflow-lite'
3
+
4
+ out_dir = 'out/reflow-1-lite'
5
+ init_from = 'resume'
6
+
7
+ start = "The sun is bright, the night is dark, fire is hot, and ice is"
8
+ num_samples = 10
9
+ max_new_tokens = 500
10
+ temperature = 0.8
11
+ top_k = 200
12
+
13
+ seed = 1337
14
+
15
+ device = 'cuda'
16
+ dtype = 'float16'
17
+ compile = False
config/sample_reflow_1_small.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ model_config = 'reflow'
3
+
4
+ out_dir = 'out/reflow-1-small'
5
+ init_from = 'resume'
6
+
7
+ start = "The sun is bright, the night is dark, fire is hot, and ice is"
8
+ num_samples = 10
9
+ max_new_tokens = 500
10
+ temperature = 0.8
11
+ top_k = 200
12
+
13
+ seed = 1337
14
+
15
+ device = 'cuda'
16
+ dtype = 'float16'
17
+ compile = False
config/sample_reflow_1_small_sp.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ model_config = 'reflow'
3
+
4
+ out_dir = 'out/reflow-1-small-sp'
5
+ init_from = 'resume'
6
+
7
+ start = "The sun is bright, the night is dark, fire is hot, and ice is"
8
+ num_samples = 10
9
+ max_new_tokens = 500
10
+ temperature = 0.8
11
+ top_k = 200
12
+
13
+ seed = 1337
14
+
15
+ device = 'cuda'
16
+ dtype = 'float16'
17
+ compile = False
config/sample_reflow_1_topk.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ model_config = 'reflow-topk'
3
+
4
+ out_dir = 'out/reflow-1-topk'
5
+ init_from = 'resume'
6
+
7
+ start = "Q: What is the capital of China?\nA: Beijing\nQ: What is the capital of Japan?\nA: Tokyo\nQ: What is the capital of France?\nA:"
8
+ num_samples = 10
9
+ max_new_tokens = 50
10
+ temperature = 0.01
11
+ top_k = 20
12
+
13
+ seed = 1337
14
+
15
+ device = 'cuda'
16
+ dtype = 'float16'
17
+ compile = False
config/sample_reflow_1_topk_big.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ model_config = 'reflow-topk'
3
+
4
+ out_dir = 'out/reflow-1-topk-big'
5
+ init_from = 'resume'
6
+
7
+ start = "Q: What is the capital of China?\nA: Beijing\nQ: What is the capital of Japan?\nA: Tokyo\nQ: What is the capital of France?\nA:"
8
+ num_samples = 10
9
+ max_new_tokens = 50
10
+ temperature = 0.01
11
+ top_k = 20
12
+
13
+ seed = 1337
14
+
15
+ device = 'cuda'
16
+ dtype = 'float16'
17
+ compile = False
config/sample_sft_reflow_1.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ model_config = 'reflow'
3
+
4
+ out_dir = 'out/sft-lima-reflow-1'
5
+ init_from = 'resume'
6
+
7
+ start = "Question: Which city is the capital of France?\nAnswer: "
8
+ num_samples = 10
9
+ max_new_tokens = 500
10
+ temperature = 0.1
11
+ top_k = 20
12
+
13
+ seed = 1337
14
+
15
+ device = 'cuda'
16
+ dtype = 'float16'
17
+ compile = False
config/train_gpt2.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model_config = 'gpt2'
2
+
3
+ log_file = 'logs/gpt2.log'
4
+
5
+ out_dir = 'out/gpt2'
6
+ eval_interval = 500
7
+ log_interval = 1
8
+ eval_iters = 500
9
+ eval_only = False
10
+ always_save_checkpoint = False
11
+ init_from = 'scratch'
12
+
13
+ wandb_log = False
14
+ wandb_project = 'owt'
15
+ wandb_run_name = 'gpt2'
16
+
17
+ dataset = 'openwebtext'
18
+ gradient_accumulation_steps = 64
19
+ batch_size = 1
20
+ block_size = 1024
21
+
22
+ n_layer = 36
23
+ n_head = 16
24
+ n_embd = 1024
25
+ vocab_size = 50304
26
+ dropout = 0.0
27
+ bias = False
28
+
29
+ learning_rate = 3e-4
30
+ max_iters = 50000
31
+ weight_decay = 1e-1
32
+ beta1 = 0.9
33
+ beta2 = 0.95
34
+ grad_clip = 1.0
35
+
36
+ decay_lr = True
37
+ warmup_iters = 1000
38
+ lr_decay_iters = 50000
39
+ min_lr = 3e-5
40
+
41
+ backend = 'nccl'
42
+
43
+ device = 'cuda'
44
+ dtype = 'float16'
45
+ compile = True
config/train_gpt2_new.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model_config = 'gpt2-new'
2
+
3
+ log_file = 'logs/gpt2-new.log'
4
+
5
+ out_dir = 'out/gpt2-new'
6
+ eval_interval = 500
7
+ log_interval = 1
8
+ eval_iters = 500
9
+ eval_only = False
10
+ always_save_checkpoint = False
11
+ init_from = 'scratch'
12
+
13
+ wandb_log = False
14
+ wandb_project = 'owt'
15
+ wandb_run_name = 'gpt2'
16
+
17
+ dataset = 'openwebtext'
18
+ gradient_accumulation_steps = 64
19
+ batch_size = 1
20
+ block_size = 1024
21
+
22
+ n_layer = 36
23
+ n_head = 16
24
+ n_embd = 1024
25
+ vocab_size = 50304
26
+ dropout = 0.0
27
+ bias = False
28
+
29
+ learning_rate = 3e-4
30
+ max_iters = 50000
31
+ weight_decay = 1e-1
32
+ beta1 = 0.9
33
+ beta2 = 0.95
34
+ grad_clip = 1.0
35
+
36
+ decay_lr = True
37
+ warmup_iters = 1000
38
+ lr_decay_iters = 50000
39
+ min_lr = 3e-5
40
+
41
+ backend = 'nccl'
42
+
43
+ device = 'cuda'
44
+ dtype = 'float16'
45
+ compile = True
config/train_gpt2_new_nolr.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model_config = 'gpt2-new'
2
+
3
+ log_file = 'logs/gpt2-new-nolr.log'
4
+
5
+ out_dir = 'out/gpt2-new-nolr'
6
+ eval_interval = 500
7
+ log_interval = 1
8
+ eval_iters = 500
9
+ eval_only = False
10
+ always_save_checkpoint = False
11
+ init_from = 'scratch'
12
+
13
+ wandb_log = False
14
+ wandb_project = 'owt'
15
+ wandb_run_name = 'gpt2'
16
+
17
+ dataset = 'openwebtext'
18
+ gradient_accumulation_steps = 64
19
+ batch_size = 1
20
+ block_size = 1024
21
+
22
+ n_layer = 36
23
+ n_head = 16
24
+ n_embd = 1024
25
+ vocab_size = 50304
26
+ dropout = 0.0
27
+ bias = False
28
+
29
+ learning_rate = 1.5e-4
30
+ max_iters = 50000
31
+ weight_decay = 0.1
32
+ beta1 = 0.9
33
+ beta2 = 0.95
34
+ grad_clip = 1.0
35
+
36
+ decay_lr = False
37
+ warmup_iters = 2000
38
+ lr_decay_iters = 600000
39
+ min_lr = 6e-5
40
+
41
+ backend = 'nccl'
42
+
43
+ device = 'cuda'
44
+ dtype = 'float16'
45
+ compile = True
config/train_gpt2_new_nolr_resume.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model_config = 'gpt2-new'
2
+
3
+ log_file = 'logs/gpt2-new-nolr.log'
4
+
5
+ out_dir = 'out/gpt2-new-nolr'
6
+ eval_interval = 500
7
+ log_interval = 1
8
+ eval_iters = 500
9
+ eval_only = False
10
+ always_save_checkpoint = False
11
+ init_from = 'resume'
12
+
13
+ wandb_log = False
14
+ wandb_project = 'owt'
15
+ wandb_run_name = 'gpt2'
16
+
17
+ dataset = 'openwebtext'
18
+ gradient_accumulation_steps = 64
19
+ batch_size = 1
20
+ block_size = 1024
21
+
22
+ n_layer = 36
23
+ n_head = 16
24
+ n_embd = 1024
25
+ vocab_size = 50304
26
+ dropout = 0.0
27
+ bias = False
28
+
29
+ learning_rate = 1.5e-4
30
+ max_iters = 50000
31
+ weight_decay = 0.1
32
+ beta1 = 0.9
33
+ beta2 = 0.95
34
+ grad_clip = 1.0
35
+
36
+ decay_lr = False
37
+ warmup_iters = 2000
38
+ lr_decay_iters = 600000
39
+ min_lr = 6e-5
40
+
41
+ backend = 'nccl'
42
+
43
+ device = 'cuda'
44
+ dtype = 'float16'
45
+ compile = True
config/train_gpt2_new_resume.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model_config = 'gpt2-new'
2
+
3
+ log_file = 'logs/gpt2-new.log'
4
+
5
+ out_dir = 'out/gpt2-new'
6
+ eval_interval = 500
7
+ log_interval = 1
8
+ eval_iters = 500
9
+ eval_only = False
10
+ always_save_checkpoint = False
11
+ init_from = 'resume'
12
+
13
+ wandb_log = False
14
+ wandb_project = 'owt'
15
+ wandb_run_name = 'gpt2'
16
+
17
+ dataset = 'openwebtext'
18
+ gradient_accumulation_steps = 64
19
+ batch_size = 1
20
+ block_size = 1024
21
+
22
+ n_layer = 36
23
+ n_head = 16
24
+ n_embd = 1024
25
+ vocab_size = 50304
26
+ dropout = 0.0
27
+ bias = False
28
+
29
+ learning_rate = 3e-4
30
+ max_iters = 50000
31
+ weight_decay = 1e-1
32
+ beta1 = 0.9
33
+ beta2 = 0.95
34
+ grad_clip = 1.0
35
+
36
+ decay_lr = True
37
+ warmup_iters = 1000
38
+ lr_decay_iters = 50000
39
+ min_lr = 3e-5
40
+
41
+ backend = 'nccl'
42
+
43
+ device = 'cuda'
44
+ dtype = 'float16'
45
+ compile = True
config/train_gpt2_resume.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model_config = 'gpt2'
2
+
3
+ log_file = 'logs/gpt2.log'
4
+
5
+ out_dir = 'out/gpt2'
6
+ eval_interval = 500
7
+ log_interval = 1
8
+ eval_iters = 500
9
+ eval_only = False
10
+ always_save_checkpoint = False
11
+ init_from = 'resume'
12
+
13
+ wandb_log = False
14
+ wandb_project = 'owt'
15
+ wandb_run_name = 'gpt2'
16
+
17
+ dataset = 'openwebtext'
18
+ gradient_accumulation_steps = 64
19
+ batch_size = 1
20
+ block_size = 1024
21
+
22
+ n_layer = 36
23
+ n_head = 16
24
+ n_embd = 1024
25
+ vocab_size = 50304
26
+ dropout = 0.0
27
+ bias = False
28
+
29
+ learning_rate = 3e-4
30
+ max_iters = 50000
31
+ weight_decay = 1e-1
32
+ beta1 = 0.9
33
+ beta2 = 0.95
34
+ grad_clip = 1.0
35
+
36
+ decay_lr = True
37
+ warmup_iters = 1000
38
+ lr_decay_iters = 50000
39
+ min_lr = 3e-5
40
+
41
+ backend = 'nccl'
42
+
43
+ device = 'cuda'
44
+ dtype = 'float16'
45
+ compile = True
config/train_reflow_1.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ model_config = 'reflow'
3
+
4
+ log_file = 'logs/reflow-1.log'
5
+
6
+ out_dir = 'out/reflow-1'
7
+ eval_interval = 500
8
+ log_interval = 1
9
+ eval_iters = 500
10
+ eval_only = False
11
+ always_save_checkpoint = False
12
+ init_from = 'scratch'
13
+
14
+ wandb_log = False
15
+ wandb_project = 'reflow-1'
16
+ wandb_run_name = 'reflow-1'
17
+
18
+ dataset = 'openwebtext'
19
+ gradient_accumulation_steps = 64
20
+ batch_size = 1
21
+ block_size = 1024
22
+
23
+ n_layer = 32
24
+ n_head = 16
25
+ n_embd = 1024
26
+ n_signals = 1024
27
+ vocab_size = 50304
28
+ dropout = 0.0
29
+ bias = False
30
+
31
+ learning_rate = 1.5e-4
32
+ max_iters = 50000
33
+ weight_decay = 0.1
34
+ beta1 = 0.9
35
+ beta2 = 0.95
36
+ grad_clip = 1.0
37
+
38
+ decay_lr = False
39
+ warmup_iters = 2000
40
+ lr_decay_iters = 600000
41
+ min_lr = 6e-5
42
+
43
+ backend = 'nccl'
44
+ device = 'cuda'
45
+ dtype = 'float16'
46
+ compile = True
config/train_reflow_1_big.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ model_config = 'reflow'
3
+
4
+ log_file = 'logs/reflow-1-big.log'
5
+
6
+ out_dir = 'out/reflow-1-big'
7
+ eval_interval = 500
8
+ log_interval = 1
9
+ eval_iters = 500
10
+ eval_only = False
11
+ always_save_checkpoint = False
12
+ init_from = 'scratch'
13
+
14
+ wandb_log = False
15
+ wandb_project = 'reflow-1-big'
16
+ wandb_run_name = 'reflow-1-big'
17
+
18
+ dataset = 'openwebtext'
19
+ gradient_accumulation_steps = 64
20
+ batch_size = 1
21
+ block_size = 1024
22
+
23
+ n_layer = 36
24
+ n_head = 16
25
+ n_embd = 1024
26
+ n_signals = 1024
27
+ vocab_size = 50304
28
+ dropout = 0.0
29
+ bias = False
30
+
31
+ learning_rate = 3e-4
32
+ max_iters = 50000
33
+ weight_decay = 1e-1
34
+ beta1 = 0.9
35
+ beta2 = 0.95
36
+ grad_clip = 1.0
37
+
38
+ decay_lr = True
39
+ warmup_iters = 1000
40
+ lr_decay_iters = 50000
41
+ min_lr = 3e-5
42
+
43
+ backend = 'nccl'
44
+ device = 'cuda'
45
+ dtype = 'float16'
46
+ compile = True
config/train_reflow_1_big_resume.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ model_config = 'reflow'
3
+
4
+ log_file = 'logs/reflow-1-big.log'
5
+
6
+ out_dir = 'out/reflow-1-big'
7
+ eval_interval = 500
8
+ log_interval = 1
9
+ eval_iters = 500
10
+ eval_only = False
11
+ always_save_checkpoint = False
12
+ init_from = 'resume'
13
+
14
+ wandb_log = False
15
+ wandb_project = 'reflow-1-big'
16
+ wandb_run_name = 'reflow-1-big'
17
+
18
+ dataset = 'openwebtext'
19
+ gradient_accumulation_steps = 64
20
+ batch_size = 1
21
+ block_size = 1024
22
+
23
+ n_layer = 36
24
+ n_head = 16
25
+ n_embd = 1024
26
+ n_signals = 1024
27
+ vocab_size = 50304
28
+ dropout = 0.0
29
+ bias = False
30
+
31
+ learning_rate = 3e-4
32
+ max_iters = 50000
33
+ weight_decay = 1e-1
34
+ beta1 = 0.9
35
+ beta2 = 0.95
36
+ grad_clip = 1.0
37
+
38
+ decay_lr = True
39
+ warmup_iters = 1000
40
+ lr_decay_iters = 50000
41
+ min_lr = 3e-5
42
+
43
+ backend = 'nccl'
44
+ device = 'cuda'
45
+ dtype = 'float16'
46
+ compile = True
config/train_reflow_1_lite.py ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ model_config = 'reflow-lite'
3
+
4
+ log_file = 'logs/reflow-1-lite.log'
5
+
6
+ out_dir = 'out/reflow-1-lite'
7
+ eval_interval = 500
8
+ log_interval = 1
9
+ eval_iters = 500
10
+ eval_only = False
11
+ always_save_checkpoint = False
12
+ init_from = 'scratch'
13
+
14
+ wandb_log = False
15
+ wandb_project = 'reflow-lite'
16
+ wandb_run_name = 'reflow-lite'
17
+
18
+ dataset = 'openwebtext'
19
+ gradient_accumulation_steps = 40
20
+ batch_size = 1
21
+ block_size = 1024
22
+
23
+ n_layer = 32
24
+ n_head = 16
25
+ n_embd = 1024
26
+ n_signals = 1024
27
+ vocab_size = 50304
28
+ dropout = 0.0
29
+ bias = False
30
+
31
+ mlp_expansion_ratio = 2.66
32
+ n_kv_head = 4
33
+
34
+ learning_rate = 1e-4
35
+ max_iters = 100000
36
+ weight_decay = 0.1
37
+ beta1 = 0.9
38
+ beta2 = 0.95
39
+ grad_clip = 1.0
40
+
41
+ decay_lr = True
42
+ warmup_iters = 2000
43
+ lr_decay_iters = 100000
44
+ min_lr = 1e-5
45
+
46
+ backend = 'nccl'
47
+ device = 'cuda'
48
+ dtype = 'bfloat16'
49
+ compile = True
config/train_reflow_1_lite_resume.py ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ model_config = 'reflow-lite'
3
+
4
+ log_file = 'logs/reflow-1-lite.log'
5
+
6
+ out_dir = 'out/reflow-1-lite'
7
+ eval_interval = 500
8
+ log_interval = 1
9
+ eval_iters = 500
10
+ eval_only = False
11
+ always_save_checkpoint = False
12
+ init_from = 'resume'
13
+
14
+ wandb_log = False
15
+ wandb_project = 'reflow-lite'
16
+ wandb_run_name = 'reflow-lite'
17
+
18
+ dataset = 'openwebtext'
19
+ gradient_accumulation_steps = 40
20
+ batch_size = 1
21
+ block_size = 1024
22
+
23
+ n_layer = 32
24
+ n_head = 16
25
+ n_embd = 1024
26
+ n_signals = 1024
27
+ vocab_size = 50304
28
+ dropout = 0.0
29
+ bias = False
30
+
31
+ mlp_expansion_ratio = 2.66
32
+ n_kv_head = 4
33
+
34
+ learning_rate = 1e-4
35
+ max_iters = 100000
36
+ weight_decay = 0.1
37
+ beta1 = 0.9
38
+ beta2 = 0.95
39
+ grad_clip = 1.0
40
+
41
+ decay_lr = True
42
+ warmup_iters = 2000
43
+ lr_decay_iters = 100000
44
+ min_lr = 1e-5
45
+
46
+ backend = 'nccl'
47
+ device = 'cuda'
48
+ dtype = 'bfloat16'
49
+ compile = True
config/train_reflow_1_resume.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ model_config = 'reflow'
3
+
4
+ log_file = 'logs/reflow-1.log'
5
+
6
+ out_dir = 'out/reflow-1'
7
+ eval_interval = 500
8
+ log_interval = 1
9
+ eval_iters = 500
10
+ eval_only = False
11
+ always_save_checkpoint = False
12
+ init_from = 'resume'
13
+
14
+ wandb_log = False
15
+ wandb_project = 'reflow-1'
16
+ wandb_run_name = 'reflow-1'
17
+
18
+ dataset = 'openwebtext'
19
+ gradient_accumulation_steps = 64
20
+ batch_size = 1
21
+ block_size = 1024
22
+
23
+ n_layer = 32
24
+ n_head = 16
25
+ n_embd = 1024
26
+ n_signals = 1024
27
+ vocab_size = 50304
28
+ dropout = 0.0
29
+ bias = False
30
+
31
+ learning_rate = 1.5e-4
32
+ max_iters = 50000
33
+ weight_decay = 0.1
34
+ beta1 = 0.9
35
+ beta2 = 0.95
36
+ grad_clip = 1.0
37
+
38
+ decay_lr = False
39
+ warmup_iters = 2000
40
+ lr_decay_iters = 600000
41
+ min_lr = 6e-5
42
+
43
+ backend = 'nccl'
44
+ device = 'cuda'
45
+ dtype = 'float16'
46
+ compile = True
config/train_reflow_1_small.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ model_config = 'reflow'
3
+
4
+ log_file = 'logs/reflow-1-small.log'
5
+
6
+ out_dir = 'out/reflow-1-small'
7
+ eval_interval = 500
8
+ log_interval = 1
9
+ eval_iters = 500
10
+ eval_only = False
11
+ always_save_checkpoint = False
12
+ init_from = 'scratch'
13
+
14
+ wandb_log = False
15
+ wandb_project = 'reflow-1-small'
16
+ wandb_run_name = 'reflow-1-small'
17
+
18
+ dataset = 'openwebtext'
19
+ gradient_accumulation_steps = 64
20
+ batch_size = 1
21
+ block_size = 1024
22
+
23
+ n_layer = 6
24
+ n_head = 8
25
+ n_embd = 512
26
+ n_signals = 512
27
+ vocab_size = 50304
28
+ dropout = 0.0
29
+ bias = False
30
+
31
+ learning_rate = 1.5e-4
32
+ max_iters = 50000
33
+ weight_decay = 0.1
34
+ beta1 = 0.9
35
+ beta2 = 0.95
36
+ grad_clip = 1.0
37
+
38
+ decay_lr = False
39
+ warmup_iters = 2000
40
+ lr_decay_iters = 600000
41
+ min_lr = 6e-5
42
+
43
+ backend = 'nccl'
44
+ device = 'cuda'
45
+ dtype = 'float16'
46
+ compile = True
config/train_reflow_1_small_resume.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ model_config = 'reflow'
3
+
4
+ log_file = 'logs/reflow-1-small.log'
5
+
6
+ out_dir = 'out/reflow-1-small'
7
+ eval_interval = 500
8
+ log_interval = 1
9
+ eval_iters = 500
10
+ eval_only = False
11
+ always_save_checkpoint = False
12
+ init_from = 'resume'
13
+
14
+ wandb_log = False
15
+ wandb_project = 'reflow-1-small'
16
+ wandb_run_name = 'reflow-1-small'
17
+
18
+ dataset = 'openwebtext'
19
+ gradient_accumulation_steps = 64
20
+ batch_size = 1
21
+ block_size = 1024
22
+
23
+ n_layer = 6
24
+ n_head = 8
25
+ n_embd = 512
26
+ n_signals = 512
27
+ vocab_size = 50304
28
+ dropout = 0.0
29
+ bias = False
30
+
31
+ learning_rate = 1.5e-4
32
+ max_iters = 50000
33
+ weight_decay = 0.1
34
+ beta1 = 0.9
35
+ beta2 = 0.95
36
+ grad_clip = 1.0
37
+
38
+ decay_lr = False
39
+ warmup_iters = 2000
40
+ lr_decay_iters = 600000
41
+ min_lr = 6e-5
42
+
43
+ backend = 'nccl'
44
+ device = 'cuda'
45
+ dtype = 'float16'
46
+ compile = True
config/train_reflow_1_small_sp.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ model_config = 'reflow'
3
+
4
+ log_file = 'logs/reflow-1-small-sp.log'
5
+
6
+ out_dir = 'out/reflow-1-small-sp'
7
+ eval_interval = 500
8
+ log_interval = 1
9
+ eval_iters = 500
10
+ eval_only = False
11
+ always_save_checkpoint = False
12
+ init_from = 'scratch'
13
+
14
+ wandb_log = False
15
+ wandb_project = 'reflow-1-small-sp'
16
+ wandb_run_name = 'reflow-1-small-sp'
17
+
18
+ dataset = 'openwebtext'
19
+ gradient_accumulation_steps = 64
20
+ batch_size = 1
21
+ block_size = 1024
22
+
23
+ n_layer = 12
24
+ n_head = 6
25
+ n_embd = 384
26
+ n_signals = 384
27
+ vocab_size = 50304
28
+ dropout = 0.0
29
+ bias = False
30
+
31
+ learning_rate = 1.5e-4
32
+ max_iters = 50000
33
+ weight_decay = 0.1
34
+ beta1 = 0.9
35
+ beta2 = 0.95
36
+ grad_clip = 1.0
37
+
38
+ decay_lr = False
39
+ warmup_iters = 2000
40
+ lr_decay_iters = 600000
41
+ min_lr = 6e-5
42
+
43
+ backend = 'nccl'
44
+ device = 'cuda'
45
+ dtype = 'float16'
46
+ compile = True
config/train_reflow_1_small_sp_resume.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ model_config = 'reflow'
3
+
4
+ log_file = 'logs/reflow-1-small-sp.log'
5
+
6
+ out_dir = 'out/reflow-1-small-sp'
7
+ eval_interval = 500
8
+ log_interval = 1
9
+ eval_iters = 500
10
+ eval_only = False
11
+ always_save_checkpoint = False
12
+ init_from = 'resume'
13
+
14
+ wandb_log = False
15
+ wandb_project = 'reflow-1-small-sp'
16
+ wandb_run_name = 'reflow-1-small-sp'
17
+
18
+ dataset = 'openwebtext'
19
+ gradient_accumulation_steps = 64
20
+ batch_size = 1
21
+ block_size = 1024
22
+
23
+ n_layer = 12
24
+ n_head = 6
25
+ n_embd = 384
26
+ n_signals = 384
27
+ vocab_size = 50304
28
+ dropout = 0.0
29
+ bias = False
30
+
31
+ learning_rate = 1.5e-4
32
+ max_iters = 50000
33
+ weight_decay = 0.1
34
+ beta1 = 0.9
35
+ beta2 = 0.95
36
+ grad_clip = 1.0
37
+
38
+ decay_lr = False
39
+ warmup_iters = 2000
40
+ lr_decay_iters = 600000
41
+ min_lr = 6e-5
42
+
43
+ backend = 'nccl'
44
+ device = 'cuda'
45
+ dtype = 'float16'
46
+ compile = True
config/train_reflow_1_topk.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ model_config = 'reflow-topk'
3
+
4
+ log_file = 'logs/reflow-1-topk.log'
5
+
6
+ out_dir = 'out/reflow-1-topk'
7
+ eval_interval = 500
8
+ log_interval = 1
9
+ eval_iters = 500
10
+ eval_only = False
11
+ always_save_checkpoint = False
12
+ init_from = 'scratch'
13
+
14
+ wandb_log = False
15
+ wandb_project = 'reflow-1-topk'
16
+ wandb_run_name = 'reflow-1-topk'
17
+
18
+ dataset = 'openwebtext'
19
+ gradient_accumulation_steps = 64
20
+ batch_size = 1
21
+ block_size = 1024
22
+
23
+ n_layer = 6
24
+ n_head = 8
25
+ n_embd = 512
26
+ n_signals = 512
27
+ vocab_size = 50304
28
+ dropout = 0.0
29
+ bias = False
30
+ max_active_signals = 32
31
+ sparsity_penalty = 1e-3
32
+
33
+ learning_rate = 3e-4
34
+ max_iters = 50000
35
+ weight_decay = 1e-1
36
+ beta1 = 0.9
37
+ beta2 = 0.95
38
+ grad_clip = 1.0
39
+
40
+ decay_lr = True
41
+ warmup_iters = 1000
42
+ lr_decay_iters = 50000
43
+ min_lr = 3e-5
44
+
45
+ backend = 'nccl'
46
+ device = 'cuda'
47
+ dtype = 'float16'
48
+ compile = True
config/train_reflow_1_topk_big.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ model_config = 'reflow-topk'
3
+
4
+ log_file = 'logs/reflow-1-topk-big.log'
5
+
6
+ out_dir = 'out/reflow-1-topk-big'
7
+ eval_interval = 500
8
+ log_interval = 1
9
+ eval_iters = 500
10
+ eval_only = False
11
+ always_save_checkpoint = False
12
+ init_from = 'scratch'
13
+
14
+ wandb_log = False
15
+ wandb_project = 'reflow-1-topk-big'
16
+ wandb_run_name = 'reflow-1-topk-big'
17
+
18
+ dataset = 'openwebtext'
19
+ gradient_accumulation_steps = 64
20
+ batch_size = 1
21
+ block_size = 1024
22
+
23
+ n_layer = 36
24
+ n_head = 16
25
+ n_embd = 1024
26
+ n_signals = 1024
27
+ vocab_size = 50304
28
+ dropout = 0.0
29
+ bias = False
30
+ max_active_signals = 64
31
+ sparsity_penalty = 1e-3
32
+
33
+ learning_rate = 3e-4
34
+ max_iters = 50000
35
+ weight_decay = 1e-1
36
+ beta1 = 0.9
37
+ beta2 = 0.95
38
+ grad_clip = 1.0
39
+
40
+ decay_lr = True
41
+ warmup_iters = 1000
42
+ lr_decay_iters = 50000
43
+ min_lr = 3e-5
44
+
45
+ backend = 'nccl'
46
+ device = 'cuda'
47
+ dtype = 'float16'
48
+ compile = True
config/train_reflow_1_topk_big_resume.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ model_config = 'reflow-topk'
3
+
4
+ log_file = 'logs/reflow-1-topk-big.log'
5
+
6
+ out_dir = 'out/reflow-1-topk-big'
7
+ eval_interval = 500
8
+ log_interval = 1
9
+ eval_iters = 500
10
+ eval_only = False
11
+ always_save_checkpoint = False
12
+ init_from = 'resume'
13
+
14
+ wandb_log = False
15
+ wandb_project = 'reflow-1-topk-big'
16
+ wandb_run_name = 'reflow-1-topk-big'
17
+
18
+ dataset = 'openwebtext'
19
+ gradient_accumulation_steps = 64
20
+ batch_size = 1
21
+ block_size = 1024
22
+
23
+ n_layer = 36
24
+ n_head = 16
25
+ n_embd = 1024
26
+ n_signals = 1024
27
+ vocab_size = 50304
28
+ dropout = 0.0
29
+ bias = False
30
+ max_active_signals = 64
31
+ sparsity_penalty = 1e-3
32
+
33
+ learning_rate = 3e-4
34
+ max_iters = 50000
35
+ weight_decay = 1e-1
36
+ beta1 = 0.9
37
+ beta2 = 0.95
38
+ grad_clip = 1.0
39
+
40
+ decay_lr = True
41
+ warmup_iters = 1000
42
+ lr_decay_iters = 50000
43
+ min_lr = 3e-5
44
+
45
+ backend = 'nccl'
46
+ device = 'cuda'
47
+ dtype = 'float16'
48
+ compile = True
config/train_reflow_base.py ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ model_config = 'reflow'
3
+
4
+ out_dir = 'out-reflow'
5
+ eval_interval = 250
6
+ log_interval = 10
7
+ eval_iters = 200
8
+ eval_only = False
9
+ always_save_checkpoint = False
10
+ init_from = 'scratch'
11
+
12
+ wandb_log = False
13
+ wandb_project = 'reflow'
14
+ wandb_run_name = 'reflow'
15
+
16
+ dataset = 'openwebtext'
17
+ gradient_accumulation_steps = 64
18
+ batch_size = 1
19
+ block_size = 1024
20
+
21
+ n_layer = 32
22
+ n_head = 16
23
+ n_embd = 1024
24
+ n_signals = 1024
25
+ dropout = 0.0
26
+ bias = False
27
+
28
+ learning_rate = 1.5e-4
29
+ max_iters = 50000
30
+ weight_decay = 0.1
31
+ beta1 = 0.9
32
+ beta2 = 0.95
33
+ grad_clip = 1.0
34
+
35
+ decay_lr = False
36
+ warmup_iters = 2000
37
+ lr_decay_iters = 600000
38
+ min_lr = 6e-5
39
+
40
+ backend = 'nccl'
41
+ device = 'cuda'
42
+ dtype = 'float16'
43
+ compile = True
config/train_resume.py ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import time
3
+
4
+ model_config = 'gpt2'
5
+
6
+ out_dir = 'out-web'
7
+ eval_interval = 50
8
+ log_interval = 1
9
+ eval_iters = 40
10
+ eval_only = False
11
+ always_save_checkpoint = False
12
+
13
+ wandb_log = True
14
+ wandb_project = 'resume'
15
+ wandb_run_name = 'resume-' + str(time.time())
16
+
17
+ dataset = 'openwebtext'
18
+ gradient_accumulation_steps = 64
19
+ batch_size = 1
20
+ block_size = 1024
21
+
22
+ n_layer = 12
23
+ n_head = 12
24
+ n_embd = 768
25
+ dropout = 0.0
26
+ bias = False
27
+
28
+ learning_rate = 1e-6
29
+ max_iters = 20
30
+ weight_decay = 0.1
31
+ beta1 = 0.9
32
+ beta2 = 0.95
33
+ grad_clip = 1.0
34
+
35
+ decay_lr = True
36
+ warmup_iters = 0
37
+ lr_decay_iters = 20
38
+ min_lr = 1e-6
39
+
40
+ backend = 'nccl'
41
+ device = 'cuda'
42
+ dtype = 'bfloat16'
43
+ compile = True
44
+ init_from = 'resume'
config/train_sft.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ model_config = 'gpt2'
3
+
4
+ out_dir = 'out-sft'
5
+ base_model_dir = 'out'
6
+ init_from = 'finetune'
7
+
8
+ eval_interval = 250
9
+ log_interval = 1
10
+ eval_iters = 40
11
+ eval_only = False
12
+ always_save_checkpoint = True
13
+
14
+ dataset = 'alpaca'
15
+ gradient_accumulation_steps = 4
16
+ batch_size = 4
17
+ block_size = 1024
18
+
19
+ n_layer = 12
20
+ n_head = 12
21
+ n_embd = 768
22
+ dropout = 0.1
23
+ bias = False
24
+
25
+ learning_rate = 2e-5
26
+ max_iters = 3000
27
+ weight_decay = 0.1
28
+ beta1 = 0.9
29
+ beta2 = 0.95
30
+ grad_clip = 1.0
31
+
32
+ decay_lr = True
33
+ warmup_iters = 100
34
+ lr_decay_iters = 3000
35
+ min_lr = 1e-6
36
+
37
+ sft_masking = True
38
+
39
+ backend = 'nccl'
40
+ device = 'cuda'
41
+ dtype = 'bfloat16'
42
+ compile = True
config/train_shakespeare_char.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ model_config = 'gpt2'
3
+
4
+ out_dir = 'out-shakespeare-char'
5
+ eval_interval = 250
6
+ log_interval = 10
7
+ eval_iters = 200
8
+ eval_only = False
9
+ always_save_checkpoint = False
10
+
11
+ wandb_log = False
12
+ wandb_project = 'shakespeare-char'
13
+ wandb_run_name = 'mini-gpt'
14
+
15
+ dataset = 'shakespeare_char'
16
+ gradient_accumulation_steps = 1
17
+ batch_size = 64
18
+ block_size = 256
19
+
20
+ n_layer = 6
21
+ n_head = 6
22
+ n_embd = 384
23
+ dropout = 0.2
24
+ bias = False
25
+
26
+ learning_rate = 1e-3
27
+ max_iters = 5000
28
+ weight_decay = 1e-1
29
+ beta1 = 0.9
30
+ beta2 = 0.99
31
+ grad_clip = 1.0
32
+
33
+ decay_lr = True
34
+ warmup_iters = 100
35
+ lr_decay_iters = 5000
36
+ min_lr = 1e-4
37
+
38
+ backend = 'nccl'
39
+ device = 'cuda'
40
+ dtype = 'bfloat16'
41
+ compile = True
42
+ init_from = 'scratch'