Upload folder using huggingface_hub
Browse filesThis view is limited to 50 files because it contains too many changes. See raw diff
- .gitattributes +14 -35
- .gitignore +11 -0
- LICENSE +21 -0
- README.md +172 -0
- README_CN.md +199 -0
- bench.py +182 -0
- check.py +122 -0
- config/base.py +41 -0
- config/bench_base.py +14 -0
- config/bench_gpt2.py +18 -0
- config/eval_gpt2.py +5 -0
- config/eval_gpt2_large.py +5 -0
- config/eval_gpt2_medium.py +5 -0
- config/eval_gpt2_xl.py +5 -0
- config/finetune_shakespeare.py +45 -0
- config/sample_base.py +12 -0
- config/sample_gpt2.py +17 -0
- config/sample_gpt2_new.py +17 -0
- config/sample_gpt2_new_nolr.py +17 -0
- config/sample_reflow_1.py +17 -0
- config/sample_reflow_1_big.py +17 -0
- config/sample_reflow_1_lite.py +17 -0
- config/sample_reflow_1_small.py +17 -0
- config/sample_reflow_1_small_sp.py +17 -0
- config/sample_reflow_1_topk.py +17 -0
- config/sample_reflow_1_topk_big.py +17 -0
- config/sample_sft_reflow_1.py +17 -0
- config/train_gpt2.py +45 -0
- config/train_gpt2_new.py +45 -0
- config/train_gpt2_new_nolr.py +45 -0
- config/train_gpt2_new_nolr_resume.py +45 -0
- config/train_gpt2_new_resume.py +45 -0
- config/train_gpt2_resume.py +45 -0
- config/train_reflow_1.py +46 -0
- config/train_reflow_1_big.py +46 -0
- config/train_reflow_1_big_resume.py +46 -0
- config/train_reflow_1_lite.py +49 -0
- config/train_reflow_1_lite_resume.py +49 -0
- config/train_reflow_1_resume.py +46 -0
- config/train_reflow_1_small.py +46 -0
- config/train_reflow_1_small_resume.py +46 -0
- config/train_reflow_1_small_sp.py +46 -0
- config/train_reflow_1_small_sp_resume.py +46 -0
- config/train_reflow_1_topk.py +48 -0
- config/train_reflow_1_topk_big.py +48 -0
- config/train_reflow_1_topk_big_resume.py +48 -0
- config/train_reflow_base.py +43 -0
- config/train_resume.py +44 -0
- config/train_sft.py +42 -0
- config/train_shakespeare_char.py +42 -0
.gitattributes
CHANGED
|
@@ -1,35 +1,14 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
*.
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
-
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
-
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
-
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
-
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
-
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
-
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
-
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
-
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
-
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
-
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
-
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
-
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
-
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
-
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
-
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
-
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
-
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
-
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
-
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
-
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
| 1 |
+
# Override jupyter in Github language stats for more accurate estimate of repo code languages
|
| 2 |
+
# reference: https://github.com/github/linguist/blob/master/docs/overrides.md#generated-code
|
| 3 |
+
*.ipynb linguist-generated
|
| 4 |
+
out/gpt2/ckpt.pt filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
out/gpt2-new/ckpt.pt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
out/reflow-1/ckpt.pt filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
out/reflow-1-big/ckpt.pt filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
out/reflow-1-lite/ckpt.pt filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
out/reflow-1-small/ckpt.pt filter=lfs diff=lfs merge=lfs -text
|
| 10 |
+
out/reflow-1-small-sp/ckpt.pt filter=lfs diff=lfs merge=lfs -text
|
| 11 |
+
out/reflow-1-topk/ckpt.pt filter=lfs diff=lfs merge=lfs -text
|
| 12 |
+
out/reflow-1-topk-big/ckpt.pt filter=lfs diff=lfs merge=lfs -text
|
| 13 |
+
paper/paper-cn.pdf filter=lfs diff=lfs merge=lfs -text
|
| 14 |
+
paper/paper.pdf filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
.gitignore
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
.DS_Store
|
| 2 |
+
.idea
|
| 3 |
+
.ipynb_checkpoints/
|
| 4 |
+
.vscode
|
| 5 |
+
__pycache__/
|
| 6 |
+
*.bin
|
| 7 |
+
*.pkl
|
| 8 |
+
*.pyc
|
| 9 |
+
input.txt
|
| 10 |
+
env/
|
| 11 |
+
venv/
|
LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
MIT License
|
| 2 |
+
|
| 3 |
+
Copyright (c) 2022 Andrej Karpathy
|
| 4 |
+
|
| 5 |
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
| 6 |
+
of this software and associated documentation files (the "Software"), to deal
|
| 7 |
+
in the Software without restriction, including without limitation the rights
|
| 8 |
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
| 9 |
+
copies of the Software, and to permit persons to whom the Software is
|
| 10 |
+
furnished to do so, subject to the following conditions:
|
| 11 |
+
|
| 12 |
+
The above copyright notice and this permission notice shall be included in all
|
| 13 |
+
copies or substantial portions of the Software.
|
| 14 |
+
|
| 15 |
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
| 16 |
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
| 17 |
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
| 18 |
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
| 19 |
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
| 20 |
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
| 21 |
+
SOFTWARE.
|
README.md
ADDED
|
@@ -0,0 +1,172 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
license: mit
|
| 3 |
+
language:
|
| 4 |
+
- en
|
| 5 |
+
- zh
|
| 6 |
+
tags:
|
| 7 |
+
- transformer
|
| 8 |
+
- interpretability
|
| 9 |
+
- mechanistic-interpretability
|
| 10 |
+
- language-model
|
| 11 |
+
- signal-decomposition
|
| 12 |
+
- sparse-representations
|
| 13 |
+
- pytorch
|
| 14 |
+
datasets:
|
| 15 |
+
- openwebtext
|
| 16 |
+
pipeline_tag: text-generation
|
| 17 |
+
---
|
| 18 |
+
|
| 19 |
+
# reFlow
|
| 20 |
+
|
| 21 |
+
**A Metal Soul In My Hand** — A feature-decoupled Transformer architecture with native interpretability.
|
| 22 |
+
|
| 23 |
+
reFlow reconstructs the traditional full-rank embedding matrix into the product of a **Recipe Matrix** $W_{recipe} \in \mathbb{R}^{V \times S}$ and a **Signal Basis Matrix** $W_{basis} \in \mathbb{R}^{S \times d}$, forcing the model to maintain a set of continuous, low-redundancy signal bases in latent space. A dynamic vocabulary matrix $W_{vocab} = W_{recipe} \times W_{basis}$ is reconstructed in real-time at each forward pass, serving simultaneously as both the embedding matrix and the output projection matrix.
|
| 24 |
+
|
| 25 |
+
> **Paper**: [English (PDF)](./paper/paper.pdf) | [中文 (PDF)](./paper/paper-cn.pdf)
|
| 26 |
+
|
| 27 |
+
## Project Structure
|
| 28 |
+
|
| 29 |
+
```
|
| 30 |
+
reFlow/
|
| 31 |
+
├── train.py # Training script (single GPU / DDP)
|
| 32 |
+
├── sample.py # Text generation from trained models
|
| 33 |
+
├── experiment.py # 12-experiment interpretability suite (Chinese)
|
| 34 |
+
├── experiment_en.py # 12-experiment interpretability suite (English)
|
| 35 |
+
├── check.py # Checkpoint parameter inspector
|
| 36 |
+
├── bench.py # Performance benchmarking
|
| 37 |
+
├── models/
|
| 38 |
+
│ ├── gpt2.py # Standard GPT-2 baseline
|
| 39 |
+
│ ├── gpt2-new.py # Modernized GPT-2 (RoPE + SwiGLU + RMSNorm)
|
| 40 |
+
│ ├── reflow.py # reFlow base architecture
|
| 41 |
+
│ ├── reflow-topk.py # reFlow with ReLU + Top-K hard sparsity
|
| 42 |
+
│ └── reflow-lite.py # reFlow with GQA + reduced MLP
|
| 43 |
+
├── config/ # Training / sampling / eval configurations
|
| 44 |
+
├── data/
|
| 45 |
+
│ ├── openwebtext/ # OpenWebText dataset preparation
|
| 46 |
+
│ └── sft-lima/ # LIMA SFT dataset preparation
|
| 47 |
+
└── out/ # Checkpoints and experiment reports
|
| 48 |
+
```
|
| 49 |
+
|
| 50 |
+
## Installation
|
| 51 |
+
|
| 52 |
+
### Prerequisites
|
| 53 |
+
|
| 54 |
+
- Python 3.10+
|
| 55 |
+
- CUDA-compatible GPU (tested on Tesla T4 x4)
|
| 56 |
+
|
| 57 |
+
### 1. PyTorch (CUDA 12.8)
|
| 58 |
+
|
| 59 |
+
```bash
|
| 60 |
+
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu128
|
| 61 |
+
```
|
| 62 |
+
|
| 63 |
+
> Adjust the CUDA version in the URL to match your driver. See [PyTorch Get Started](https://pytorch.org/get-started/locally/).
|
| 64 |
+
|
| 65 |
+
### 2. Core Dependencies
|
| 66 |
+
|
| 67 |
+
```bash
|
| 68 |
+
pip install datasets tiktoken wandb tqdm
|
| 69 |
+
```
|
| 70 |
+
|
| 71 |
+
### 3. Experiment Suite Dependencies
|
| 72 |
+
|
| 73 |
+
The interpretability experiments (`experiment.py`) require additional packages:
|
| 74 |
+
|
| 75 |
+
```bash
|
| 76 |
+
pip install numpy matplotlib seaborn scikit-learn scipy adjustText
|
| 77 |
+
```
|
| 78 |
+
|
| 79 |
+
### Quick Install (All-in-One)
|
| 80 |
+
|
| 81 |
+
```bash
|
| 82 |
+
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu128
|
| 83 |
+
pip install datasets tiktoken wandb tqdm numpy matplotlib seaborn scikit-learn scipy adjustText
|
| 84 |
+
```
|
| 85 |
+
|
| 86 |
+
## Data Preparation
|
| 87 |
+
|
| 88 |
+
### OpenWebText
|
| 89 |
+
|
| 90 |
+
```bash
|
| 91 |
+
python data/openwebtext/prepare.py
|
| 92 |
+
```
|
| 93 |
+
|
| 94 |
+
This downloads the OpenWebText corpus (~54 GB) and tokenizes it with the GPT-2 BPE tokenizer. Output: `data/openwebtext/train.bin` (~17 GB, ~9B tokens) and `val.bin`.
|
| 95 |
+
|
| 96 |
+
## Training
|
| 97 |
+
|
| 98 |
+
All configurations are in `config/`. No CLI overrides — all hyperparameters must be set in the config file.
|
| 99 |
+
|
| 100 |
+
### Single GPU
|
| 101 |
+
|
| 102 |
+
```bash
|
| 103 |
+
python train.py config/train_reflow_1.py
|
| 104 |
+
```
|
| 105 |
+
|
| 106 |
+
### Multi-GPU (DDP)
|
| 107 |
+
|
| 108 |
+
```bash
|
| 109 |
+
torchrun --standalone --nproc_per_node=4 train.py config/train_reflow_1.py
|
| 110 |
+
```
|
| 111 |
+
|
| 112 |
+
### Available Training Configs
|
| 113 |
+
|
| 114 |
+
| Config | Architecture | Layers | Params | Notes |
|
| 115 |
+
|--------|-------------|--------|--------|-------|
|
| 116 |
+
| `train_gpt2.py` | GPT-2 | 36 | 505.62M | Standard baseline |
|
| 117 |
+
| `train_gpt2_new.py` | GPT-2-New | 36 | 514.01M | + RoPE, SwiGLU, RMSNorm |
|
| 118 |
+
| `train_reflow_1.py` | reFlow | 32 | 463.67M | Base reFlow, constant lr |
|
| 119 |
+
| `train_reflow_1_big.py` | reFlow | 36 | 515.06M | lr decay, for interpretability |
|
| 120 |
+
| `train_reflow_1_topk_big.py` | reFlow-TopK | 36 | 515.06M | + ReLU + Top-64 sparsity |
|
| 121 |
+
| `train_reflow_1_lite.py` | reFlow-Lite | 32 | 413.34M | + GQA, reduced MLP |
|
| 122 |
+
| `train_reflow_1_small.py` | reFlow | 6 | 46.47M | Small-scale validation |
|
| 123 |
+
|
| 124 |
+
### Resume Training
|
| 125 |
+
|
| 126 |
+
Append `_resume` to the config name (e.g., `train_reflow_1_big_resume.py`).
|
| 127 |
+
|
| 128 |
+
## Text Generation
|
| 129 |
+
|
| 130 |
+
```bash
|
| 131 |
+
python sample.py config/sample_reflow_1.py
|
| 132 |
+
```
|
| 133 |
+
|
| 134 |
+
Edit the config file to change the prompt, temperature, top-k, etc.
|
| 135 |
+
|
| 136 |
+
## Interpretability Experiments
|
| 137 |
+
|
| 138 |
+
The experiment suite runs 12 analyses on a trained reFlow model. Both Chinese and English versions are available:
|
| 139 |
+
|
| 140 |
+
```bash
|
| 141 |
+
python experiment_en.py config/train_reflow_1_big.py # English
|
| 142 |
+
python experiment.py config/train_reflow_1_big.py # Chinese
|
| 143 |
+
```
|
| 144 |
+
|
| 145 |
+
An interactive menu will appear:
|
| 146 |
+
|
| 147 |
+
| # | Experiment | Group |
|
| 148 |
+
|---|-----------|-------|
|
| 149 |
+
| 1 | Recipe Atlas — recipe-space nearest neighbors | A. Signal Identity |
|
| 150 |
+
| 2 | Sparsity Profile — activation sparsity analysis | A. Signal Identity |
|
| 151 |
+
| 3 | Basis Geometry — singular value & effective rank | A. Signal Identity |
|
| 152 |
+
| 4 | Semantic Galaxy — PCA clustering visualization | B. Semantic Properties |
|
| 153 |
+
| 5 | Semantic Algebra — vector arithmetic (king − man + woman = queen) | B. Semantic Properties |
|
| 154 |
+
| 6 | Typo Resilience — robustness to spelling errors | B. Semantic Properties |
|
| 155 |
+
| 7 | Layer Evolution — per-layer probability crystallization | C. Mechanistic Analysis |
|
| 156 |
+
| 8 | Signal Flow — signal activation heatmaps across layers | C. Mechanistic Analysis |
|
| 157 |
+
| 9 | Causal Ablation — progressive signal knockout curves | C. Mechanistic Analysis |
|
| 158 |
+
| 10 | Emotion Surgery — sentiment steering via signal injection | D. Control & Steering |
|
| 159 |
+
| 11 | Concept Inception — binary-search concept implantation | D. Control & Steering |
|
| 160 |
+
| 12 | Genetic Hijack — global recipe matrix manipulation | D. Control & Steering |
|
| 161 |
+
|
| 162 |
+
Enter `all` to run all experiments, or specific numbers (e.g., `1 3 5`). Reports are saved to `out/<model>/audit_reports/`.
|
| 163 |
+
|
| 164 |
+
## Checkpoint Inspection
|
| 165 |
+
|
| 166 |
+
```bash
|
| 167 |
+
python check.py config/train_reflow_1.py out/reflow-1/ckpt.pt
|
| 168 |
+
```
|
| 169 |
+
|
| 170 |
+
## License
|
| 171 |
+
|
| 172 |
+
MIT License. Based on [nanoGPT](https://github.com/karpathy/nanoGPT) by Andrej Karpathy.
|
README_CN.md
ADDED
|
@@ -0,0 +1,199 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# reFlow
|
| 2 |
+
|
| 3 |
+
**A Metal Soul In My Hand** — 具备原生可解释性的特征解耦 Transformer 架构。
|
| 4 |
+
|
| 5 |
+
reFlow 将传统全秩词嵌入矩阵重构为**配方矩阵** $W_{recipe} \in \mathbb{R}^{V \times S}$ 与**信号基底矩阵** $W_{basis} \in \mathbb{R}^{S \times d}$ 的乘积形式,迫使模型在潜空间中维护一组连续、低冗余的信号基底。动态词表矩阵 $W_{vocab} = W_{recipe} \times W_{basis}$ 在每次前向传播中实时重构,同时作为嵌入矩阵与输出投影矩阵使用。
|
| 6 |
+
|
| 7 |
+
> **论文**: [English (PDF)](./paper/paper.pdf) | [中文 (PDF)](./paper/paper-cn.pdf)
|
| 8 |
+
|
| 9 |
+
## 项目结构
|
| 10 |
+
|
| 11 |
+
```
|
| 12 |
+
reFlow/
|
| 13 |
+
├── train.py # 训练脚本(单卡 / DDP 多卡)
|
| 14 |
+
├── sample.py # 从训练好的模型生成文本
|
| 15 |
+
├── experiment.py # 12 项可解释性实验套件(中文)
|
| 16 |
+
├── experiment_en.py # 12 项可解释性实验套件(English)
|
| 17 |
+
├── check.py # Checkpoint 参数检查工具
|
| 18 |
+
├── bench.py # 性能基准测试
|
| 19 |
+
├── models/
|
| 20 |
+
│ ├── gpt2.py # 标准 GPT-2 基线
|
| 21 |
+
│ ├── gpt2-new.py # 现代化 GPT-2(RoPE + SwiGLU + RMSNorm)
|
| 22 |
+
│ ├── reflow.py # reFlow 基础架构
|
| 23 |
+
│ ├── reflow-topk.py # reFlow + ReLU + Top-K 硬稀疏变体
|
| 24 |
+
│ └── reflow-lite.py # reFlow + GQA + 缩减 MLP 轻量变体
|
| 25 |
+
├── config/ # 训练 / 采样 / 评估配置文件
|
| 26 |
+
├── data/
|
| 27 |
+
│ ├── openwebtext/ # OpenWebText 数据集预处理
|
| 28 |
+
│ └── sft-lima/ # LIMA SFT 数据集预处理
|
| 29 |
+
└── out/ # Checkpoints 与实验报告输出
|
| 30 |
+
```
|
| 31 |
+
|
| 32 |
+
## 安装
|
| 33 |
+
|
| 34 |
+
### 环境要求
|
| 35 |
+
|
| 36 |
+
- Python 3.10+
|
| 37 |
+
- 支持 CUDA 的 GPU(实验环境:Tesla T4 x4)
|
| 38 |
+
|
| 39 |
+
### 1. 安装 PyTorch(CUDA 12.8)
|
| 40 |
+
|
| 41 |
+
```bash
|
| 42 |
+
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu128
|
| 43 |
+
```
|
| 44 |
+
|
| 45 |
+
> 请根据你的 CUDA 驱动版本调整 URL 中的版本号。详见 [PyTorch 官方安装指南](https://pytorch.org/get-started/locally/)。
|
| 46 |
+
|
| 47 |
+
### 2. 核心依赖
|
| 48 |
+
|
| 49 |
+
```bash
|
| 50 |
+
pip install datasets tiktoken wandb tqdm
|
| 51 |
+
```
|
| 52 |
+
|
| 53 |
+
### 3. 可解释性实验依赖
|
| 54 |
+
|
| 55 |
+
运行 `experiment.py` 需要额外安装以下包:
|
| 56 |
+
|
| 57 |
+
```bash
|
| 58 |
+
pip install numpy matplotlib seaborn scikit-learn scipy adjustText
|
| 59 |
+
```
|
| 60 |
+
|
| 61 |
+
### 一键安装(全部依赖)
|
| 62 |
+
|
| 63 |
+
```bash
|
| 64 |
+
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu128
|
| 65 |
+
pip install datasets tiktoken wandb tqdm numpy matplotlib seaborn scikit-learn scipy adjustText
|
| 66 |
+
```
|
| 67 |
+
|
| 68 |
+
## 数据准备
|
| 69 |
+
|
| 70 |
+
### OpenWebText
|
| 71 |
+
|
| 72 |
+
```bash
|
| 73 |
+
python data/openwebtext/prepare.py
|
| 74 |
+
```
|
| 75 |
+
|
| 76 |
+
该脚本会下载 OpenWebText 语料库(约 54 GB),使用 GPT-2 BPE 分词器进行编码。输出:`data/openwebtext/train.bin`(约 17 GB,约 90 亿 tokens)和 `val.bin`。
|
| 77 |
+
|
| 78 |
+
## 训练
|
| 79 |
+
|
| 80 |
+
所有超参数均在 `config/` 目录下的配置文件中指定,不支持命令行覆盖。
|
| 81 |
+
|
| 82 |
+
### 单卡训练
|
| 83 |
+
|
| 84 |
+
```bash
|
| 85 |
+
python train.py config/train_reflow_1.py
|
| 86 |
+
```
|
| 87 |
+
|
| 88 |
+
### 多卡训练(DDP)
|
| 89 |
+
|
| 90 |
+
```bash
|
| 91 |
+
torchrun --standalone --nproc_per_node=4 train.py config/train_reflow_1.py
|
| 92 |
+
```
|
| 93 |
+
|
| 94 |
+
### 可用训练配置
|
| 95 |
+
|
| 96 |
+
| 配置文件 | 架构 | 层数 | 参数量 | 说明 |
|
| 97 |
+
|---------|------|------|--------|------|
|
| 98 |
+
| `train_gpt2.py` | GPT-2 | 36 | 505.62M | 标准基线 |
|
| 99 |
+
| `train_gpt2_new.py` | GPT-2-New | 36 | 514.01M | + RoPE、SwiGLU、RMSNorm |
|
| 100 |
+
| `train_reflow_1.py` | reFlow | 32 | 463.67M | 基础 reFlow,恒定学习率 |
|
| 101 |
+
| `train_reflow_1_big.py` | reFlow | 36 | 515.06M | 学习率衰减,用于可解释性实验 |
|
| 102 |
+
| `train_reflow_1_topk_big.py` | reFlow-TopK | 36 | 515.06M | + ReLU + Top-64 稀疏化 |
|
| 103 |
+
| `train_reflow_1_lite.py` | reFlow-Lite | 32 | 413.34M | + GQA,缩减 MLP |
|
| 104 |
+
| `train_reflow_1_small.py` | reFlow | 6 | 46.47M | 小规模验证 |
|
| 105 |
+
|
| 106 |
+
### 断点续训
|
| 107 |
+
|
| 108 |
+
使用对应的 `_resume` 配置文件(如 `train_reflow_1_big_resume.py`)。
|
| 109 |
+
|
| 110 |
+
## 文本生成
|
| 111 |
+
|
| 112 |
+
```bash
|
| 113 |
+
python sample.py config/sample_reflow_1.py
|
| 114 |
+
```
|
| 115 |
+
|
| 116 |
+
在配置文件中修改 prompt、temperature、top_k 等参数。
|
| 117 |
+
|
| 118 |
+
## 可解释性实验
|
| 119 |
+
|
| 120 |
+
实验套件对训练好的 reFlow 模型执行 12 项分析,提供中英文两个版本:
|
| 121 |
+
|
| 122 |
+
```bash
|
| 123 |
+
python experiment.py config/train_reflow_1_big.py # 中文版
|
| 124 |
+
python experiment_en.py config/train_reflow_1_big.py # English
|
| 125 |
+
```
|
| 126 |
+
|
| 127 |
+
运行后将出现交互式菜单:
|
| 128 |
+
|
| 129 |
+
| 编号 | 实验名称 | 分组 |
|
| 130 |
+
|------|---------|------|
|
| 131 |
+
| 1 | 配方空间图谱 — 配方近邻与聚类热力图 | A. 信号本体 |
|
| 132 |
+
| 2 | 信号稀疏性分析 — 激活稀疏率统计 | A. 信号本体 |
|
| 133 |
+
| 3 | 信号基底几何 — 奇异值分解与有效秩 | A. 信号本体 |
|
| 134 |
+
| 4 | 语义星空图 — PCA 聚类可视化 | B. 语义性质 |
|
| 135 |
+
| 5 | 语义代数运算 — 向量算术(king − man + woman = queen) | B. 语义性质 |
|
| 136 |
+
| 6 | 拼写鲁棒性 — 对拼写错误的容忍度 | B. 语义性质 |
|
| 137 |
+
| 7 | 层级概率演化 — 逐层预测概率结晶过程 | C. 机械分析 |
|
| 138 |
+
| 8 | 信号流追踪 — 信号激活热力图 | C. 机械分析 |
|
| 139 |
+
| 9 | 因果消融曲线 — 逐信号消融概率变化 | C. 机械分析 |
|
| 140 |
+
| 10 | 情绪手术 — 信号注入实现情感翻转 | D. 操控验证 |
|
| 141 |
+
| 11 | 概念注入 — 二分搜索概念植入 | D. 操控验证 |
|
| 142 |
+
| 12 | 基因库篡改 — 全局配方矩阵操控 | D. 操控验证 |
|
| 143 |
+
|
| 144 |
+
输入 `all` 运行全部实验,或输入编号(如 `1 3 5`)选择性运行。实验报告保存至 `out/<模型名>/audit_reports/`。
|
| 145 |
+
|
| 146 |
+
## Checkpoint 检查
|
| 147 |
+
|
| 148 |
+
```bash
|
| 149 |
+
python check.py config/train_reflow_1.py out/reflow-1/ckpt.pt
|
| 150 |
+
```
|
| 151 |
+
|
| 152 |
+
## 复现论文实验
|
| 153 |
+
|
| 154 |
+
### 第 4 章:收敛实验
|
| 155 |
+
|
| 156 |
+
依次训练各模型变体并对比训练曲线:
|
| 157 |
+
|
| 158 |
+
```bash
|
| 159 |
+
# GPT-2 基线
|
| 160 |
+
torchrun --standalone --nproc_per_node=4 train.py config/train_gpt2.py
|
| 161 |
+
|
| 162 |
+
# GPT-2-New(现代化组件)
|
| 163 |
+
torchrun --standalone --nproc_per_node=4 train.py config/train_gpt2_new.py
|
| 164 |
+
|
| 165 |
+
# reFlow 基础版
|
| 166 |
+
torchrun --standalone --nproc_per_node=4 train.py config/train_reflow_1.py
|
| 167 |
+
|
| 168 |
+
# reFlow-Big(用于第 5-6 章实验)
|
| 169 |
+
torchrun --standalone --nproc_per_node=4 train.py config/train_reflow_1_big.py
|
| 170 |
+
|
| 171 |
+
# reFlow-TopK-Big(硬稀疏变体)
|
| 172 |
+
torchrun --standalone --nproc_per_node=4 train.py config/train_reflow_1_topk_big.py
|
| 173 |
+
|
| 174 |
+
# reFlow-Lite(轻量版)
|
| 175 |
+
torchrun --standalone --nproc_per_node=4 train.py config/train_reflow_1_lite.py
|
| 176 |
+
|
| 177 |
+
# reFlow-Small(小规模验证)
|
| 178 |
+
torchrun --standalone --nproc_per_node=4 train.py config/train_reflow_1_small.py
|
| 179 |
+
```
|
| 180 |
+
|
| 181 |
+
### 第 5 章:可解释性实验(reFlow-1-Big)
|
| 182 |
+
|
| 183 |
+
```bash
|
| 184 |
+
python experiment.py config/train_reflow_1_big.py
|
| 185 |
+
# 输入 all 运行全部 12 项实验
|
| 186 |
+
```
|
| 187 |
+
|
| 188 |
+
### 第 6 章:硬稀疏对比实验(reFlow-1-TopK-Big)
|
| 189 |
+
|
| 190 |
+
```bash
|
| 191 |
+
python experiment.py config/train_reflow_1_topk_big.py
|
| 192 |
+
# 输入 all 运行全部 12 项实验
|
| 193 |
+
```
|
| 194 |
+
|
| 195 |
+
对比两组实验报告即可复现论文第 6 章的对比表格。
|
| 196 |
+
|
| 197 |
+
## 许可证
|
| 198 |
+
|
| 199 |
+
MIT License。基于 [nanoGPT](https://github.com/karpathy/nanoGPT)(Andrej Karpathy)二次开发。
|
bench.py
ADDED
|
@@ -0,0 +1,182 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Benchmark script for model performance testing
|
| 3 |
+
|
| 4 |
+
REQUIRED:
|
| 5 |
+
1. You must specify a config file from the config/ directory
|
| 6 |
+
2. All configuration must be in the config file. No CLI overrides allowed
|
| 7 |
+
|
| 8 |
+
Usage:
|
| 9 |
+
python bench.py <config_file>
|
| 10 |
+
|
| 11 |
+
Example:
|
| 12 |
+
python bench.py config/bench_gpt2.py
|
| 13 |
+
"""
|
| 14 |
+
import sys
|
| 15 |
+
import os
|
| 16 |
+
|
| 17 |
+
# -----------------------------------------------------------------------------
|
| 18 |
+
# Configuration loading (BEFORE imports to validate config first)
|
| 19 |
+
# -----------------------------------------------------------------------------
|
| 20 |
+
if len(sys.argv) != 2:
|
| 21 |
+
print("ERROR: Invalid arguments!")
|
| 22 |
+
print("Usage: python bench.py <config_file>")
|
| 23 |
+
print("Available configs in config/:")
|
| 24 |
+
print(" - bench_gpt2.py")
|
| 25 |
+
sys.exit(1)
|
| 26 |
+
|
| 27 |
+
config_file = sys.argv[1]
|
| 28 |
+
|
| 29 |
+
# Disallow --key=value arguments
|
| 30 |
+
for arg in sys.argv[1:]:
|
| 31 |
+
if arg.startswith('--'):
|
| 32 |
+
print(f"ERROR: CLI overrides are not supported. All config must be in file: {config_file}")
|
| 33 |
+
sys.exit(1)
|
| 34 |
+
|
| 35 |
+
# Load config
|
| 36 |
+
print(f"Loading config from: {config_file}")
|
| 37 |
+
exec(open(config_file).read())
|
| 38 |
+
|
| 39 |
+
# Validate required config keys
|
| 40 |
+
required_keys = ['model_config']
|
| 41 |
+
missing_keys = [k for k in required_keys if k not in globals()]
|
| 42 |
+
if missing_keys:
|
| 43 |
+
print(f"ERROR: Missing required config keys: {missing_keys}")
|
| 44 |
+
sys.exit(1)
|
| 45 |
+
|
| 46 |
+
# Load model configuration
|
| 47 |
+
model_config = globals()['model_config']
|
| 48 |
+
model_file = f"models/{model_config}.py"
|
| 49 |
+
try:
|
| 50 |
+
exec(open(model_file).read())
|
| 51 |
+
except FileNotFoundError:
|
| 52 |
+
print(f"ERROR: Model file not found: {model_file}")
|
| 53 |
+
sys.exit(1)
|
| 54 |
+
|
| 55 |
+
# Get model-specific required config keys from GPTConfig
|
| 56 |
+
model_required_keys = []
|
| 57 |
+
if 'GPTConfig' in globals():
|
| 58 |
+
config_class = globals()['GPTConfig']
|
| 59 |
+
import dataclasses
|
| 60 |
+
for field in dataclasses.fields(config_class):
|
| 61 |
+
model_required_keys.append(field.name)
|
| 62 |
+
|
| 63 |
+
# Validate model-specific config keys
|
| 64 |
+
if init_from == 'scratch':
|
| 65 |
+
missing_model_keys = [k for k in model_required_keys if k not in globals()]
|
| 66 |
+
if missing_model_keys:
|
| 67 |
+
print(f"ERROR: Missing required model config keys for {model_config}: {missing_model_keys}")
|
| 68 |
+
sys.exit(1)
|
| 69 |
+
|
| 70 |
+
# Print configuration
|
| 71 |
+
print("\n" + "=" * 60)
|
| 72 |
+
print("BENCH CONFIGURATION")
|
| 73 |
+
print("=" * 60)
|
| 74 |
+
for key in sorted(globals().keys()):
|
| 75 |
+
val = globals().get(key)
|
| 76 |
+
if isinstance(val, (int, float, bool, str)) and not key.startswith('_'):
|
| 77 |
+
print(f" {key:30s} = {val}")
|
| 78 |
+
print("=" * 60 + "\n")
|
| 79 |
+
|
| 80 |
+
# Now import dependencies
|
| 81 |
+
import os
|
| 82 |
+
from contextlib import nullcontext
|
| 83 |
+
import numpy as np
|
| 84 |
+
import time
|
| 85 |
+
import torch
|
| 86 |
+
|
| 87 |
+
# Import GPTConfig and GPT
|
| 88 |
+
GPTConfig = globals()['GPTConfig']
|
| 89 |
+
GPT = globals()['GPT']
|
| 90 |
+
|
| 91 |
+
# Auto-detect dtype
|
| 92 |
+
if dtype == 'bfloat16' and not (torch.cuda.is_available() and torch.cuda.is_bf16_supported()):
|
| 93 |
+
dtype = 'float16'
|
| 94 |
+
|
| 95 |
+
torch.manual_seed(seed)
|
| 96 |
+
torch.cuda.manual_seed(seed)
|
| 97 |
+
torch.backends.cuda.matmul.allow_tf32 = True
|
| 98 |
+
torch.backends.cudnn.allow_tf32 = True
|
| 99 |
+
device_type = 'cuda' if 'cuda' in device else 'cpu'
|
| 100 |
+
ptdtype = {'float32': torch.float32, 'bfloat16': torch.bfloat16, 'float16': torch.float16}[dtype]
|
| 101 |
+
ctx = nullcontext() if device_type == 'cpu' else torch.amp.autocast(device_type=device_type, dtype=ptdtype)
|
| 102 |
+
|
| 103 |
+
# data loading
|
| 104 |
+
if real_data:
|
| 105 |
+
dataset = globals().get('dataset', 'openwebtext')
|
| 106 |
+
data_dir = os.path.join('data', dataset)
|
| 107 |
+
train_data = np.memmap(os.path.join(data_dir, 'train.bin'), dtype=np.uint16, mode='r')
|
| 108 |
+
def get_batch(split):
|
| 109 |
+
data = train_data
|
| 110 |
+
ix = torch.randint(len(data) - block_size, (batch_size,))
|
| 111 |
+
x = torch.stack([torch.from_numpy((data[i:i+block_size]).astype(np.int64)) for i in ix])
|
| 112 |
+
y = torch.stack([torch.from_numpy((data[i+1:i+1+block_size]).astype(np.int64)) for i in ix])
|
| 113 |
+
x, y = x.pin_memory().to(device, non_blocking=True), y.pin_memory().to(device, non_blocking=True)
|
| 114 |
+
return x, y
|
| 115 |
+
else:
|
| 116 |
+
x = torch.randint(50304, (batch_size, block_size), device=device)
|
| 117 |
+
y = torch.randint(50304, (batch_size, block_size), device=device)
|
| 118 |
+
get_batch = lambda split: (x, y)
|
| 119 |
+
|
| 120 |
+
# model init
|
| 121 |
+
gptconf = GPTConfig(
|
| 122 |
+
block_size=block_size,
|
| 123 |
+
n_layer=n_layer,
|
| 124 |
+
n_head=n_head,
|
| 125 |
+
n_embd=n_embd,
|
| 126 |
+
dropout=0,
|
| 127 |
+
bias=bias,
|
| 128 |
+
)
|
| 129 |
+
model = GPT(gptconf)
|
| 130 |
+
model.to(device)
|
| 131 |
+
|
| 132 |
+
optimizer = model.configure_optimizers(weight_decay=1e-2, learning_rate=1e-4, betas=(0.9, 0.95), device_type=device_type)
|
| 133 |
+
|
| 134 |
+
if compile:
|
| 135 |
+
print("Compiling model...")
|
| 136 |
+
model = torch.compile(model)
|
| 137 |
+
|
| 138 |
+
if profile:
|
| 139 |
+
wait, warmup, active = 5, 5, 5
|
| 140 |
+
num_steps = wait + warmup + active
|
| 141 |
+
with torch.profiler.profile(
|
| 142 |
+
activities=[torch.profiler.ProfilerActivity.CPU, torch.profiler.ProfilerActivity.CUDA],
|
| 143 |
+
schedule=torch.profiler.schedule(wait=wait, warmup=warmup, active=active, repeat=1),
|
| 144 |
+
on_trace_ready=torch.profiler.tensorboard_trace_handler('./bench_log'),
|
| 145 |
+
record_shapes=False,
|
| 146 |
+
profile_memory=False,
|
| 147 |
+
with_stack=False,
|
| 148 |
+
with_flops=True,
|
| 149 |
+
with_modules=False,
|
| 150 |
+
) as prof:
|
| 151 |
+
X, Y = get_batch('train')
|
| 152 |
+
for k in range(num_steps):
|
| 153 |
+
with ctx:
|
| 154 |
+
logits, loss = model(X, Y)
|
| 155 |
+
X, Y = get_batch('train')
|
| 156 |
+
optimizer.zero_grad(set_to_none=True)
|
| 157 |
+
loss.backward()
|
| 158 |
+
optimizer.step()
|
| 159 |
+
lossf = loss.item()
|
| 160 |
+
print(f"{k}/{num_steps} loss: {lossf:.4f}")
|
| 161 |
+
prof.step()
|
| 162 |
+
else:
|
| 163 |
+
# simple benchmarking
|
| 164 |
+
torch.cuda.synchronize()
|
| 165 |
+
for stage, num_steps in enumerate([10, 20]):
|
| 166 |
+
t0 = time.time()
|
| 167 |
+
X, Y = get_batch('train')
|
| 168 |
+
for k in range(num_steps):
|
| 169 |
+
with ctx:
|
| 170 |
+
logits, loss = model(X, Y)
|
| 171 |
+
X, Y = get_batch('train')
|
| 172 |
+
optimizer.zero_grad(set_to_none=True)
|
| 173 |
+
loss.backward()
|
| 174 |
+
optimizer.step()
|
| 175 |
+
lossf = loss.item()
|
| 176 |
+
print(f"{k}/{num_steps} loss: {lossf:.4f}")
|
| 177 |
+
torch.cuda.synchronize()
|
| 178 |
+
t1 = time.time()
|
| 179 |
+
dt = t1 - t0
|
| 180 |
+
mfu = model.estimate_mfu(batch_size * 1 * num_steps, dt)
|
| 181 |
+
if stage == 1:
|
| 182 |
+
print(f"time per iteration: {dt/num_steps*1000:.4f}ms, MFU: {mfu*100:.2f}%")
|
check.py
ADDED
|
@@ -0,0 +1,122 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Check model parameters from a checkpoint
|
| 3 |
+
|
| 4 |
+
Usage:
|
| 5 |
+
python check.py <config_file> <checkpoint_file>
|
| 6 |
+
|
| 7 |
+
Example:
|
| 8 |
+
python check.py config/train_reflow_web.py out-web/ckpt.pt
|
| 9 |
+
"""
|
| 10 |
+
import sys
|
| 11 |
+
import os
|
| 12 |
+
import torch
|
| 13 |
+
|
| 14 |
+
# -----------------------------------------------------------------------------
|
| 15 |
+
# Configuration loading
|
| 16 |
+
# -----------------------------------------------------------------------------
|
| 17 |
+
if len(sys.argv) != 3:
|
| 18 |
+
print("ERROR: Invalid arguments!")
|
| 19 |
+
print("Usage: python check.py <config_file> <checkpoint_file>")
|
| 20 |
+
print("Example: python check.py config/train_reflow_web.py out-web/ckpt.pt")
|
| 21 |
+
sys.exit(1)
|
| 22 |
+
|
| 23 |
+
config_file = sys.argv[1]
|
| 24 |
+
checkpoint_file = sys.argv[2]
|
| 25 |
+
|
| 26 |
+
if not os.path.exists(config_file):
|
| 27 |
+
print(f"ERROR: Config file not found: {config_file}")
|
| 28 |
+
sys.exit(1)
|
| 29 |
+
|
| 30 |
+
if not os.path.exists(checkpoint_file):
|
| 31 |
+
print(f"ERROR: Checkpoint file not found: {checkpoint_file}")
|
| 32 |
+
sys.exit(1)
|
| 33 |
+
|
| 34 |
+
# Load config
|
| 35 |
+
print(f"Loading config from: {config_file}")
|
| 36 |
+
exec(open(config_file).read())
|
| 37 |
+
|
| 38 |
+
# Load model configuration
|
| 39 |
+
model_config = globals().get('model_config')
|
| 40 |
+
if not model_config:
|
| 41 |
+
print("ERROR: 'model_config' is required in config file")
|
| 42 |
+
sys.exit(1)
|
| 43 |
+
|
| 44 |
+
model_file = f"models/{model_config}.py"
|
| 45 |
+
try:
|
| 46 |
+
exec(open(model_file).read())
|
| 47 |
+
except FileNotFoundError:
|
| 48 |
+
print(f"ERROR: Model file not found: {model_file}")
|
| 49 |
+
sys.exit(1)
|
| 50 |
+
|
| 51 |
+
# Import GPTConfig and GPT
|
| 52 |
+
GPTConfig = globals()['GPTConfig']
|
| 53 |
+
GPT = globals()['GPT']
|
| 54 |
+
|
| 55 |
+
# Load checkpoint
|
| 56 |
+
print(f"Loading checkpoint from: {checkpoint_file}")
|
| 57 |
+
checkpoint = torch.load(checkpoint_file, map_location='cpu')
|
| 58 |
+
model_args = checkpoint['model_args']
|
| 59 |
+
|
| 60 |
+
# Create model and load weights
|
| 61 |
+
gptconf = GPTConfig(**model_args)
|
| 62 |
+
model = GPT(gptconf)
|
| 63 |
+
state_dict = checkpoint['model']
|
| 64 |
+
|
| 65 |
+
# Handle PyTorch 2.0+ compiled model keys
|
| 66 |
+
unwanted_prefix = '_orig_mod.'
|
| 67 |
+
for k in list(state_dict.keys()):
|
| 68 |
+
if k.startswith(unwanted_prefix):
|
| 69 |
+
state_dict[k[len(unwanted_prefix):]] = state_dict.pop(k)
|
| 70 |
+
|
| 71 |
+
model.load_state_dict(state_dict)
|
| 72 |
+
|
| 73 |
+
# Print model information
|
| 74 |
+
print("\n" + "=" * 60)
|
| 75 |
+
print("MODEL INFORMATION")
|
| 76 |
+
print("=" * 60)
|
| 77 |
+
|
| 78 |
+
print(f"\nModel Architecture: {model_config}")
|
| 79 |
+
print(f"Checkpoint: {checkpoint_file}")
|
| 80 |
+
|
| 81 |
+
print(f"\nModel Arguments:")
|
| 82 |
+
for k, v in model_args.items():
|
| 83 |
+
print(f" {k:20s} = {v}")
|
| 84 |
+
|
| 85 |
+
print(f"\nTotal Parameters: {model.get_num_params()/1e6:.2f}M")
|
| 86 |
+
|
| 87 |
+
# Count parameters by component
|
| 88 |
+
if hasattr(model, 'transformer'):
|
| 89 |
+
print("\nParameters by component:")
|
| 90 |
+
|
| 91 |
+
# Show wte (embedding) - for Reflow includes vocab_to_signals + signal_basis
|
| 92 |
+
if hasattr(model.transformer, 'wte'):
|
| 93 |
+
wte = model.transformer.wte
|
| 94 |
+
if hasattr(wte, 'vocab_to_signals'):
|
| 95 |
+
vocab_to_signals_params = wte.vocab_to_signals.weight.numel()
|
| 96 |
+
print(f" transformer.wte.vocab_to_signals: {vocab_to_signals_params/1e6:>10.2f}M")
|
| 97 |
+
if hasattr(wte, 'signal_basis'):
|
| 98 |
+
signal_basis_params = wte.signal_basis.numel()
|
| 99 |
+
print(f" transformer.wte.signal_basis: {signal_basis_params/1e6:>10.2f}M")
|
| 100 |
+
wte_params = sum(p.numel() for p in wte.parameters())
|
| 101 |
+
print(f" transformer.wte (total): {wte_params/1e6:>10.2f}M")
|
| 102 |
+
|
| 103 |
+
# Count transformer.h (layers)
|
| 104 |
+
if hasattr(model.transformer, 'h'):
|
| 105 |
+
h_params = sum(p.numel() for p in model.transformer.h.parameters())
|
| 106 |
+
print(f" transformer.h (all layers): {h_params/1e6:>10.2f}M")
|
| 107 |
+
|
| 108 |
+
# Show ln_f
|
| 109 |
+
if hasattr(model.transformer, 'ln_f'):
|
| 110 |
+
ln_f_params = sum(p.numel() for p in model.transformer.ln_f.parameters())
|
| 111 |
+
print(f" transformer.ln_f: {ln_f_params/1e6:>10.2f}M")
|
| 112 |
+
|
| 113 |
+
print(f"\nTotal trainable parameters: {sum(p.numel() for p in model.parameters() if p.requires_grad)/1e6:.2f}M")
|
| 114 |
+
print(f"Total non-trainable parameters: {sum(p.numel() for p in model.parameters() if not p.requires_grad)/1e6:.2f}M")
|
| 115 |
+
|
| 116 |
+
# Training info if available
|
| 117 |
+
if 'iter_num' in checkpoint:
|
| 118 |
+
print(f"\nTraining Info:")
|
| 119 |
+
print(f" iter_num: {checkpoint['iter_num']}")
|
| 120 |
+
print(f" best_val_loss: {checkpoint.get('best_val_loss', 'N/A')}")
|
| 121 |
+
|
| 122 |
+
print("=" * 60)
|
config/base.py
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
# Baseline training configuration (GPT-2 124M-scale defaults on OpenWebText).

# -- run / evaluation bookkeeping --
out_dir = 'out'
eval_interval = 2000
log_interval = 1
eval_iters = 200
eval_only = False
always_save_checkpoint = True
init_from = 'scratch'

# -- weights & biases logging --
wandb_log = False
wandb_project = 'owt'
wandb_run_name = 'gpt2'

# -- data / batching --
dataset = 'openwebtext'
gradient_accumulation_steps = 5 * 8  # effective accumulation of 40 micro-steps
batch_size = 12
block_size = 1024

# -- model shape --
n_layer = 12
n_head = 12
n_embd = 768
dropout = 0.0
bias = False

# -- optimizer (AdamW-style hyperparameters) --
learning_rate = 6e-4
max_iters = 600000
weight_decay = 1e-1
beta1 = 0.9
beta2 = 0.95
grad_clip = 1.0

# -- learning-rate schedule --
decay_lr = True
warmup_iters = 2000
lr_decay_iters = 600000
min_lr = 6e-5

# -- distributed backend --
backend = 'nccl'

# -- runtime --
device = 'cuda'
dtype = 'bfloat16'
compile = True  # intentionally shadows the builtin; consumed by the trainer
|
config/bench_base.py
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
# Baseline benchmarking configuration (used by bench.py).

# -- batching / data --
batch_size = 12
block_size = 1024
bias = False
real_data = True  # benchmark on real dataset batches rather than random data
seed = 1337

# -- runtime --
device = 'cuda'
dtype = 'bfloat16'
compile = True
profile = False  # when True, bench.py runs the torch profiler instead

# -- model shape --
n_layer = 12
n_head = 12
n_embd = 768
|
config/bench_gpt2.py
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
# Benchmarking configuration for the stock GPT-2 model definition.

model_config = 'gpt2'

# -- batching / data --
batch_size = 12
block_size = 1024
bias = False
real_data = True
seed = 1337

# -- runtime --
device = 'cuda'
dtype = 'bfloat16'
compile = True
profile = False

# -- model shape --
n_layer = 12
n_head = 12
n_embd = 768

init_from = 'scratch'
|
config/eval_gpt2.py
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Evaluation-only configuration for pretrained GPT-2 (base size).
batch_size = 8
eval_iters = 500
eval_only = True  # run a single eval pass, no training
wandb_log = False
init_from = 'gpt2'  # pull pretrained weights by name
|
config/eval_gpt2_large.py
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Evaluation-only configuration for pretrained GPT-2 Large.
batch_size = 8
eval_iters = 500
eval_only = True
wandb_log = False
init_from = 'gpt2-large'
|
config/eval_gpt2_medium.py
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Evaluation-only configuration for pretrained GPT-2 Medium.
batch_size = 8
eval_iters = 500
eval_only = True
wandb_log = False
init_from = 'gpt2-medium'
|
config/eval_gpt2_xl.py
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Evaluation-only configuration for pretrained GPT-2 XL.
batch_size = 8
eval_iters = 500
eval_only = True
wandb_log = False
init_from = 'gpt2-xl'
|
config/finetune_shakespeare.py
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
# Fine-tuning configuration: GPT-2 XL on the Shakespeare dataset.

import time

model_config = 'gpt2'

# -- run / evaluation bookkeeping --
out_dir = 'out-shakespeare'
eval_interval = 5
log_interval = 1
eval_iters = 40
eval_only = False
always_save_checkpoint = False
init_from = 'gpt2-xl'  # start from pretrained GPT-2 XL weights

# -- weights & biases logging --
wandb_log = False
wandb_project = 'shakespeare'
wandb_run_name = 'ft-' + str(time.time())  # unique name per launch

# -- data / batching --
dataset = 'shakespeare'
gradient_accumulation_steps = 32
batch_size = 1
block_size = 1024

# -- model shape (None => inherit from the pretrained checkpoint) --
n_layer = None
n_head = None
n_embd = None
dropout = 0.1
bias = False

# -- optimizer --
learning_rate = 3e-5
max_iters = 20
weight_decay = 1e-1
beta1 = 0.9
beta2 = 0.95
grad_clip = 1.0

# -- learning-rate schedule (constant LR fine-tune) --
decay_lr = False
warmup_iters = 0
lr_decay_iters = 20
min_lr = 3e-5

# -- distributed backend --
backend = 'nccl'

# -- runtime --
device = 'cuda'
dtype = 'bfloat16'
compile = True
|
config/sample_base.py
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
# Baseline sampling configuration (used by sample.py).

init_from = 'resume'  # load the latest checkpoint from out_dir
out_dir = 'out'

# -- generation settings --
start = "\n"  # prompt; a bare newline starts from "nothing"
num_samples = 10
max_new_tokens = 500
temperature = 0.8
top_k = 200

# -- runtime --
seed = 1337
device = 'cuda'
dtype = 'bfloat16'
compile = False
|
config/sample_gpt2.py
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
# Sampling configuration for the trained GPT-2 run.

model_config = 'gpt2'

out_dir = 'out/gpt2'
init_from = 'resume'

# -- generation settings --
start = "The sun is bright, the night is dark, fire is hot, and ice is"
num_samples = 10
max_new_tokens = 500
temperature = 0.8
top_k = 200

seed = 1337

# -- runtime --
device = 'cuda'
dtype = 'bfloat16'
compile = False
|
config/sample_gpt2_new.py
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
# Sampling configuration for the 'gpt2-new' run (near-greedy Q&A probe).

model_config = 'gpt2-new'

out_dir = 'out/gpt2-new'
init_from = 'resume'

# -- generation settings (very low temperature => near-deterministic) --
start = "Q: What is the capital of China?\nA: Beijing\nQ: What is the capital of Japan?\nA: Tokyo\nQ: What is the capital of France?\nA:"
num_samples = 10
max_new_tokens = 50
temperature = 0.01
top_k = 20

seed = 1337

# -- runtime --
device = 'cuda'
dtype = 'float16'
compile = False
|
config/sample_gpt2_new_nolr.py
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
# Sampling configuration for the 'gpt2-new' run trained without LR decay.

model_config = 'gpt2-new'

out_dir = 'out/gpt2-new-nolr'
init_from = 'resume'

# -- generation settings --
start = "The sun is bright, the night is dark, fire is hot, and ice is"
num_samples = 10
max_new_tokens = 500
temperature = 0.8
top_k = 200

seed = 1337

# -- runtime --
device = 'cuda'
dtype = 'bfloat16'
compile = False
|
config/sample_reflow_1.py
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
# Sampling configuration for the reflow-1 run (near-greedy Q&A probe).

model_config = 'reflow'

out_dir = 'out/reflow-1'
init_from = 'resume'

# -- generation settings --
start = "Q: What is the capital of China?\nA: Beijing\nQ: What is the capital of Japan?\nA: Tokyo\nQ: What is the capital of France?\nA:"
num_samples = 10
max_new_tokens = 50
temperature = 0.01
top_k = 20

seed = 1337

# -- runtime --
device = 'cuda'
dtype = 'float16'
compile = False
|
config/sample_reflow_1_big.py
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
# Sampling configuration for the reflow-1-big run.

model_config = 'reflow'

out_dir = 'out/reflow-1-big'
init_from = 'resume'

# -- generation settings --
start = "The sun is bright, the night is dark, fire is hot, and ice is"
num_samples = 10
max_new_tokens = 500
temperature = 0.8
top_k = 200

seed = 1337

# -- runtime --
device = 'cuda'
dtype = 'float16'
compile = False
|
config/sample_reflow_1_lite.py
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
# Sampling configuration for the reflow-1-lite run.

model_config = 'reflow-lite'

out_dir = 'out/reflow-1-lite'
init_from = 'resume'

# -- generation settings --
start = "The sun is bright, the night is dark, fire is hot, and ice is"
num_samples = 10
max_new_tokens = 500
temperature = 0.8
top_k = 200

seed = 1337

# -- runtime --
device = 'cuda'
dtype = 'float16'
compile = False
|
config/sample_reflow_1_small.py
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
# Sampling configuration for the reflow-1-small run.

model_config = 'reflow'

out_dir = 'out/reflow-1-small'
init_from = 'resume'

# -- generation settings --
start = "The sun is bright, the night is dark, fire is hot, and ice is"
num_samples = 10
max_new_tokens = 500
temperature = 0.8
top_k = 200

seed = 1337

# -- runtime --
device = 'cuda'
dtype = 'float16'
compile = False
|
config/sample_reflow_1_small_sp.py
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
# Sampling configuration for the reflow-1-small-sp run.

model_config = 'reflow'

out_dir = 'out/reflow-1-small-sp'
init_from = 'resume'

# -- generation settings --
start = "The sun is bright, the night is dark, fire is hot, and ice is"
num_samples = 10
max_new_tokens = 500
temperature = 0.8
top_k = 200

seed = 1337

# -- runtime --
device = 'cuda'
dtype = 'float16'
compile = False
|
config/sample_reflow_1_topk.py
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
# Sampling configuration for the reflow-1-topk run (near-greedy Q&A probe).

model_config = 'reflow-topk'

out_dir = 'out/reflow-1-topk'
init_from = 'resume'

# -- generation settings --
start = "Q: What is the capital of China?\nA: Beijing\nQ: What is the capital of Japan?\nA: Tokyo\nQ: What is the capital of France?\nA:"
num_samples = 10
max_new_tokens = 50
temperature = 0.01
top_k = 20

seed = 1337

# -- runtime --
device = 'cuda'
dtype = 'float16'
compile = False
|
config/sample_reflow_1_topk_big.py
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
# Sampling configuration for the reflow-1-topk-big run (near-greedy Q&A probe).

model_config = 'reflow-topk'

out_dir = 'out/reflow-1-topk-big'
init_from = 'resume'

# -- generation settings --
start = "Q: What is the capital of China?\nA: Beijing\nQ: What is the capital of Japan?\nA: Tokyo\nQ: What is the capital of France?\nA:"
num_samples = 10
max_new_tokens = 50
temperature = 0.01
top_k = 20

seed = 1337

# -- runtime --
device = 'cuda'
dtype = 'float16'
compile = False
|
config/sample_sft_reflow_1.py
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
# Sampling configuration for the SFT'd (LIMA) reflow-1 run.

model_config = 'reflow'

out_dir = 'out/sft-lima-reflow-1'
init_from = 'resume'

# -- generation settings --
start = "Question: Which city is the capital of France?\nAnswer: "
num_samples = 10
max_new_tokens = 500
temperature = 0.1
top_k = 20

seed = 1337

# -- runtime --
device = 'cuda'
dtype = 'float16'
compile = False
|
config/train_gpt2.py
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Training configuration: GPT-2 (36-layer) from scratch on OpenWebText.
model_config = 'gpt2'

log_file = 'logs/gpt2.log'

# -- run / evaluation bookkeeping --
out_dir = 'out/gpt2'
eval_interval = 500
log_interval = 1
eval_iters = 500
eval_only = False
always_save_checkpoint = False
init_from = 'scratch'

# -- weights & biases logging --
wandb_log = False
wandb_project = 'owt'
wandb_run_name = 'gpt2'

# -- data / batching --
dataset = 'openwebtext'
gradient_accumulation_steps = 64
batch_size = 1
block_size = 1024

# -- model shape --
n_layer = 36
n_head = 16
n_embd = 1024
vocab_size = 50304  # padded vocab (multiple of 64) for efficiency
dropout = 0.0
bias = False

# -- optimizer --
learning_rate = 3e-4
max_iters = 50000
weight_decay = 1e-1
beta1 = 0.9
beta2 = 0.95
grad_clip = 1.0

# -- learning-rate schedule --
decay_lr = True
warmup_iters = 1000
lr_decay_iters = 50000
min_lr = 3e-5

backend = 'nccl'

# -- runtime --
device = 'cuda'
dtype = 'float16'
compile = True
|
config/train_gpt2_new.py
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Training configuration: 'gpt2-new' variant from scratch on OpenWebText.
model_config = 'gpt2-new'

log_file = 'logs/gpt2-new.log'

# -- run / evaluation bookkeeping --
out_dir = 'out/gpt2-new'
eval_interval = 500
log_interval = 1
eval_iters = 500
eval_only = False
always_save_checkpoint = False
init_from = 'scratch'

# -- weights & biases logging --
wandb_log = False
wandb_project = 'owt'
wandb_run_name = 'gpt2'

# -- data / batching --
dataset = 'openwebtext'
gradient_accumulation_steps = 64
batch_size = 1
block_size = 1024

# -- model shape --
n_layer = 36
n_head = 16
n_embd = 1024
vocab_size = 50304
dropout = 0.0
bias = False

# -- optimizer --
learning_rate = 3e-4
max_iters = 50000
weight_decay = 1e-1
beta1 = 0.9
beta2 = 0.95
grad_clip = 1.0

# -- learning-rate schedule --
decay_lr = True
warmup_iters = 1000
lr_decay_iters = 50000
min_lr = 3e-5

backend = 'nccl'

# -- runtime --
device = 'cuda'
dtype = 'float16'
compile = True
|
config/train_gpt2_new_nolr.py
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Training configuration: 'gpt2-new' variant with a constant learning rate.
model_config = 'gpt2-new'

log_file = 'logs/gpt2-new-nolr.log'

# -- run / evaluation bookkeeping --
out_dir = 'out/gpt2-new-nolr'
eval_interval = 500
log_interval = 1
eval_iters = 500
eval_only = False
always_save_checkpoint = False
init_from = 'scratch'

# -- weights & biases logging --
wandb_log = False
wandb_project = 'owt'
wandb_run_name = 'gpt2'

# -- data / batching --
dataset = 'openwebtext'
gradient_accumulation_steps = 64
batch_size = 1
block_size = 1024

# -- model shape --
n_layer = 36
n_head = 16
n_embd = 1024
vocab_size = 50304
dropout = 0.0
bias = False

# -- optimizer --
learning_rate = 1.5e-4
max_iters = 50000
weight_decay = 0.1
beta1 = 0.9
beta2 = 0.95
grad_clip = 1.0

# -- learning-rate schedule (disabled: constant LR) --
decay_lr = False
warmup_iters = 2000
lr_decay_iters = 600000
min_lr = 6e-5

backend = 'nccl'

# -- runtime --
device = 'cuda'
dtype = 'float16'
compile = True
|
config/train_gpt2_new_nolr_resume.py
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Resume configuration for the 'gpt2-new' constant-LR run
# (identical to train_gpt2_new_nolr.py except init_from = 'resume').
model_config = 'gpt2-new'

log_file = 'logs/gpt2-new-nolr.log'

# -- run / evaluation bookkeeping --
out_dir = 'out/gpt2-new-nolr'
eval_interval = 500
log_interval = 1
eval_iters = 500
eval_only = False
always_save_checkpoint = False
init_from = 'resume'

# -- weights & biases logging --
wandb_log = False
wandb_project = 'owt'
wandb_run_name = 'gpt2'

# -- data / batching --
dataset = 'openwebtext'
gradient_accumulation_steps = 64
batch_size = 1
block_size = 1024

# -- model shape --
n_layer = 36
n_head = 16
n_embd = 1024
vocab_size = 50304
dropout = 0.0
bias = False

# -- optimizer --
learning_rate = 1.5e-4
max_iters = 50000
weight_decay = 0.1
beta1 = 0.9
beta2 = 0.95
grad_clip = 1.0

# -- learning-rate schedule (disabled: constant LR) --
decay_lr = False
warmup_iters = 2000
lr_decay_iters = 600000
min_lr = 6e-5

backend = 'nccl'

# -- runtime --
device = 'cuda'
dtype = 'float16'
compile = True
|
config/train_gpt2_new_resume.py
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Resume configuration for the 'gpt2-new' run
# (identical to train_gpt2_new.py except init_from = 'resume').
model_config = 'gpt2-new'

log_file = 'logs/gpt2-new.log'

# -- run / evaluation bookkeeping --
out_dir = 'out/gpt2-new'
eval_interval = 500
log_interval = 1
eval_iters = 500
eval_only = False
always_save_checkpoint = False
init_from = 'resume'

# -- weights & biases logging --
wandb_log = False
wandb_project = 'owt'
wandb_run_name = 'gpt2'

# -- data / batching --
dataset = 'openwebtext'
gradient_accumulation_steps = 64
batch_size = 1
block_size = 1024

# -- model shape --
n_layer = 36
n_head = 16
n_embd = 1024
vocab_size = 50304
dropout = 0.0
bias = False

# -- optimizer --
learning_rate = 3e-4
max_iters = 50000
weight_decay = 1e-1
beta1 = 0.9
beta2 = 0.95
grad_clip = 1.0

# -- learning-rate schedule --
decay_lr = True
warmup_iters = 1000
lr_decay_iters = 50000
min_lr = 3e-5

backend = 'nccl'

# -- runtime --
device = 'cuda'
dtype = 'float16'
compile = True
|
config/train_gpt2_resume.py
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Resume configuration for the GPT-2 run
# (identical to train_gpt2.py except init_from = 'resume').
model_config = 'gpt2'

log_file = 'logs/gpt2.log'

# -- run / evaluation bookkeeping --
out_dir = 'out/gpt2'
eval_interval = 500
log_interval = 1
eval_iters = 500
eval_only = False
always_save_checkpoint = False
init_from = 'resume'

# -- weights & biases logging --
wandb_log = False
wandb_project = 'owt'
wandb_run_name = 'gpt2'

# -- data / batching --
dataset = 'openwebtext'
gradient_accumulation_steps = 64
batch_size = 1
block_size = 1024

# -- model shape --
n_layer = 36
n_head = 16
n_embd = 1024
vocab_size = 50304
dropout = 0.0
bias = False

# -- optimizer --
learning_rate = 3e-4
max_iters = 50000
weight_decay = 1e-1
beta1 = 0.9
beta2 = 0.95
grad_clip = 1.0

# -- learning-rate schedule --
decay_lr = True
warmup_iters = 1000
lr_decay_iters = 50000
min_lr = 3e-5

backend = 'nccl'

# -- runtime --
device = 'cuda'
dtype = 'float16'
compile = True
|
config/train_reflow_1.py
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
# Training configuration: reflow-1 (32-layer reflow model, constant LR).
model_config = 'reflow'

log_file = 'logs/reflow-1.log'

# -- run / evaluation bookkeeping --
out_dir = 'out/reflow-1'
eval_interval = 500
log_interval = 1
eval_iters = 500
eval_only = False
always_save_checkpoint = False
init_from = 'scratch'

# -- weights & biases logging --
wandb_log = False
wandb_project = 'reflow-1'
wandb_run_name = 'reflow-1'

# -- data / batching --
dataset = 'openwebtext'
gradient_accumulation_steps = 64
batch_size = 1
block_size = 1024

# -- model shape --
n_layer = 32
n_head = 16
n_embd = 1024
n_signals = 1024  # reflow-specific signal dimension
vocab_size = 50304
dropout = 0.0
bias = False

# -- optimizer --
learning_rate = 1.5e-4
max_iters = 50000
weight_decay = 0.1
beta1 = 0.9
beta2 = 0.95
grad_clip = 1.0

# -- learning-rate schedule (disabled: constant LR) --
decay_lr = False
warmup_iters = 2000
lr_decay_iters = 600000
min_lr = 6e-5

backend = 'nccl'

# -- runtime --
device = 'cuda'
dtype = 'float16'
compile = True
|
config/train_reflow_1_big.py
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
# Training configuration: reflow-1-big (36-layer reflow model, decaying LR).
model_config = 'reflow'

log_file = 'logs/reflow-1-big.log'

# -- run / evaluation bookkeeping --
out_dir = 'out/reflow-1-big'
eval_interval = 500
log_interval = 1
eval_iters = 500
eval_only = False
always_save_checkpoint = False
init_from = 'scratch'

# -- weights & biases logging --
wandb_log = False
wandb_project = 'reflow-1-big'
wandb_run_name = 'reflow-1-big'

# -- data / batching --
dataset = 'openwebtext'
gradient_accumulation_steps = 64
batch_size = 1
block_size = 1024

# -- model shape --
n_layer = 36
n_head = 16
n_embd = 1024
n_signals = 1024
vocab_size = 50304
dropout = 0.0
bias = False

# -- optimizer --
learning_rate = 3e-4
max_iters = 50000
weight_decay = 1e-1
beta1 = 0.9
beta2 = 0.95
grad_clip = 1.0

# -- learning-rate schedule --
decay_lr = True
warmup_iters = 1000
lr_decay_iters = 50000
min_lr = 3e-5

backend = 'nccl'

# -- runtime --
device = 'cuda'
dtype = 'float16'
compile = True
|
config/train_reflow_1_big_resume.py
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
# Resume configuration for reflow-1-big
# (identical to train_reflow_1_big.py except init_from = 'resume').
model_config = 'reflow'

log_file = 'logs/reflow-1-big.log'

# -- run / evaluation bookkeeping --
out_dir = 'out/reflow-1-big'
eval_interval = 500
log_interval = 1
eval_iters = 500
eval_only = False
always_save_checkpoint = False
init_from = 'resume'

# -- weights & biases logging --
wandb_log = False
wandb_project = 'reflow-1-big'
wandb_run_name = 'reflow-1-big'

# -- data / batching --
dataset = 'openwebtext'
gradient_accumulation_steps = 64
batch_size = 1
block_size = 1024

# -- model shape --
n_layer = 36
n_head = 16
n_embd = 1024
n_signals = 1024
vocab_size = 50304
dropout = 0.0
bias = False

# -- optimizer --
learning_rate = 3e-4
max_iters = 50000
weight_decay = 1e-1
beta1 = 0.9
beta2 = 0.95
grad_clip = 1.0

# -- learning-rate schedule --
decay_lr = True
warmup_iters = 1000
lr_decay_iters = 50000
min_lr = 3e-5

backend = 'nccl'

# -- runtime --
device = 'cuda'
dtype = 'float16'
compile = True
|
config/train_reflow_1_lite.py
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
model_config = 'reflow-lite'
|
| 3 |
+
|
| 4 |
+
log_file = 'logs/reflow-1-lite.log'
|
| 5 |
+
|
| 6 |
+
out_dir = 'out/reflow-1-lite'
|
| 7 |
+
eval_interval = 500
|
| 8 |
+
log_interval = 1
|
| 9 |
+
eval_iters = 500
|
| 10 |
+
eval_only = False
|
| 11 |
+
always_save_checkpoint = False
|
| 12 |
+
init_from = 'scratch'
|
| 13 |
+
|
| 14 |
+
wandb_log = False
|
| 15 |
+
wandb_project = 'reflow-lite'
|
| 16 |
+
wandb_run_name = 'reflow-lite'
|
| 17 |
+
|
| 18 |
+
dataset = 'openwebtext'
|
| 19 |
+
gradient_accumulation_steps = 40
|
| 20 |
+
batch_size = 1
|
| 21 |
+
block_size = 1024
|
| 22 |
+
|
| 23 |
+
n_layer = 32
|
| 24 |
+
n_head = 16
|
| 25 |
+
n_embd = 1024
|
| 26 |
+
n_signals = 1024
|
| 27 |
+
vocab_size = 50304
|
| 28 |
+
dropout = 0.0
|
| 29 |
+
bias = False
|
| 30 |
+
|
| 31 |
+
mlp_expansion_ratio = 2.66
|
| 32 |
+
n_kv_head = 4
|
| 33 |
+
|
| 34 |
+
learning_rate = 1e-4
|
| 35 |
+
max_iters = 100000
|
| 36 |
+
weight_decay = 0.1
|
| 37 |
+
beta1 = 0.9
|
| 38 |
+
beta2 = 0.95
|
| 39 |
+
grad_clip = 1.0
|
| 40 |
+
|
| 41 |
+
decay_lr = True
|
| 42 |
+
warmup_iters = 2000
|
| 43 |
+
lr_decay_iters = 100000
|
| 44 |
+
min_lr = 1e-5
|
| 45 |
+
|
| 46 |
+
backend = 'nccl'
|
| 47 |
+
device = 'cuda'
|
| 48 |
+
dtype = 'bfloat16'
|
| 49 |
+
compile = True
|
config/train_reflow_1_lite_resume.py
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
model_config = 'reflow-lite'
|
| 3 |
+
|
| 4 |
+
log_file = 'logs/reflow-1-lite.log'
|
| 5 |
+
|
| 6 |
+
out_dir = 'out/reflow-1-lite'
|
| 7 |
+
eval_interval = 500
|
| 8 |
+
log_interval = 1
|
| 9 |
+
eval_iters = 500
|
| 10 |
+
eval_only = False
|
| 11 |
+
always_save_checkpoint = False
|
| 12 |
+
init_from = 'resume'
|
| 13 |
+
|
| 14 |
+
wandb_log = False
|
| 15 |
+
wandb_project = 'reflow-lite'
|
| 16 |
+
wandb_run_name = 'reflow-lite'
|
| 17 |
+
|
| 18 |
+
dataset = 'openwebtext'
|
| 19 |
+
gradient_accumulation_steps = 40
|
| 20 |
+
batch_size = 1
|
| 21 |
+
block_size = 1024
|
| 22 |
+
|
| 23 |
+
n_layer = 32
|
| 24 |
+
n_head = 16
|
| 25 |
+
n_embd = 1024
|
| 26 |
+
n_signals = 1024
|
| 27 |
+
vocab_size = 50304
|
| 28 |
+
dropout = 0.0
|
| 29 |
+
bias = False
|
| 30 |
+
|
| 31 |
+
mlp_expansion_ratio = 2.66
|
| 32 |
+
n_kv_head = 4
|
| 33 |
+
|
| 34 |
+
learning_rate = 1e-4
|
| 35 |
+
max_iters = 100000
|
| 36 |
+
weight_decay = 0.1
|
| 37 |
+
beta1 = 0.9
|
| 38 |
+
beta2 = 0.95
|
| 39 |
+
grad_clip = 1.0
|
| 40 |
+
|
| 41 |
+
decay_lr = True
|
| 42 |
+
warmup_iters = 2000
|
| 43 |
+
lr_decay_iters = 100000
|
| 44 |
+
min_lr = 1e-5
|
| 45 |
+
|
| 46 |
+
backend = 'nccl'
|
| 47 |
+
device = 'cuda'
|
| 48 |
+
dtype = 'bfloat16'
|
| 49 |
+
compile = True
|
config/train_reflow_1_resume.py
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
model_config = 'reflow'
|
| 3 |
+
|
| 4 |
+
log_file = 'logs/reflow-1.log'
|
| 5 |
+
|
| 6 |
+
out_dir = 'out/reflow-1'
|
| 7 |
+
eval_interval = 500
|
| 8 |
+
log_interval = 1
|
| 9 |
+
eval_iters = 500
|
| 10 |
+
eval_only = False
|
| 11 |
+
always_save_checkpoint = False
|
| 12 |
+
init_from = 'resume'
|
| 13 |
+
|
| 14 |
+
wandb_log = False
|
| 15 |
+
wandb_project = 'reflow-1'
|
| 16 |
+
wandb_run_name = 'reflow-1'
|
| 17 |
+
|
| 18 |
+
dataset = 'openwebtext'
|
| 19 |
+
gradient_accumulation_steps = 64
|
| 20 |
+
batch_size = 1
|
| 21 |
+
block_size = 1024
|
| 22 |
+
|
| 23 |
+
n_layer = 32
|
| 24 |
+
n_head = 16
|
| 25 |
+
n_embd = 1024
|
| 26 |
+
n_signals = 1024
|
| 27 |
+
vocab_size = 50304
|
| 28 |
+
dropout = 0.0
|
| 29 |
+
bias = False
|
| 30 |
+
|
| 31 |
+
learning_rate = 1.5e-4
|
| 32 |
+
max_iters = 50000
|
| 33 |
+
weight_decay = 0.1
|
| 34 |
+
beta1 = 0.9
|
| 35 |
+
beta2 = 0.95
|
| 36 |
+
grad_clip = 1.0
|
| 37 |
+
|
| 38 |
+
decay_lr = False
|
| 39 |
+
warmup_iters = 2000
|
| 40 |
+
lr_decay_iters = 600000
|
| 41 |
+
min_lr = 6e-5
|
| 42 |
+
|
| 43 |
+
backend = 'nccl'
|
| 44 |
+
device = 'cuda'
|
| 45 |
+
dtype = 'float16'
|
| 46 |
+
compile = True
|
config/train_reflow_1_small.py
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
model_config = 'reflow'
|
| 3 |
+
|
| 4 |
+
log_file = 'logs/reflow-1-small.log'
|
| 5 |
+
|
| 6 |
+
out_dir = 'out/reflow-1-small'
|
| 7 |
+
eval_interval = 500
|
| 8 |
+
log_interval = 1
|
| 9 |
+
eval_iters = 500
|
| 10 |
+
eval_only = False
|
| 11 |
+
always_save_checkpoint = False
|
| 12 |
+
init_from = 'scratch'
|
| 13 |
+
|
| 14 |
+
wandb_log = False
|
| 15 |
+
wandb_project = 'reflow-1-small'
|
| 16 |
+
wandb_run_name = 'reflow-1-small'
|
| 17 |
+
|
| 18 |
+
dataset = 'openwebtext'
|
| 19 |
+
gradient_accumulation_steps = 64
|
| 20 |
+
batch_size = 1
|
| 21 |
+
block_size = 1024
|
| 22 |
+
|
| 23 |
+
n_layer = 6
|
| 24 |
+
n_head = 8
|
| 25 |
+
n_embd = 512
|
| 26 |
+
n_signals = 512
|
| 27 |
+
vocab_size = 50304
|
| 28 |
+
dropout = 0.0
|
| 29 |
+
bias = False
|
| 30 |
+
|
| 31 |
+
learning_rate = 1.5e-4
|
| 32 |
+
max_iters = 50000
|
| 33 |
+
weight_decay = 0.1
|
| 34 |
+
beta1 = 0.9
|
| 35 |
+
beta2 = 0.95
|
| 36 |
+
grad_clip = 1.0
|
| 37 |
+
|
| 38 |
+
decay_lr = False
|
| 39 |
+
warmup_iters = 2000
|
| 40 |
+
lr_decay_iters = 600000
|
| 41 |
+
min_lr = 6e-5
|
| 42 |
+
|
| 43 |
+
backend = 'nccl'
|
| 44 |
+
device = 'cuda'
|
| 45 |
+
dtype = 'float16'
|
| 46 |
+
compile = True
|
config/train_reflow_1_small_resume.py
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
model_config = 'reflow'
|
| 3 |
+
|
| 4 |
+
log_file = 'logs/reflow-1-small.log'
|
| 5 |
+
|
| 6 |
+
out_dir = 'out/reflow-1-small'
|
| 7 |
+
eval_interval = 500
|
| 8 |
+
log_interval = 1
|
| 9 |
+
eval_iters = 500
|
| 10 |
+
eval_only = False
|
| 11 |
+
always_save_checkpoint = False
|
| 12 |
+
init_from = 'resume'
|
| 13 |
+
|
| 14 |
+
wandb_log = False
|
| 15 |
+
wandb_project = 'reflow-1-small'
|
| 16 |
+
wandb_run_name = 'reflow-1-small'
|
| 17 |
+
|
| 18 |
+
dataset = 'openwebtext'
|
| 19 |
+
gradient_accumulation_steps = 64
|
| 20 |
+
batch_size = 1
|
| 21 |
+
block_size = 1024
|
| 22 |
+
|
| 23 |
+
n_layer = 6
|
| 24 |
+
n_head = 8
|
| 25 |
+
n_embd = 512
|
| 26 |
+
n_signals = 512
|
| 27 |
+
vocab_size = 50304
|
| 28 |
+
dropout = 0.0
|
| 29 |
+
bias = False
|
| 30 |
+
|
| 31 |
+
learning_rate = 1.5e-4
|
| 32 |
+
max_iters = 50000
|
| 33 |
+
weight_decay = 0.1
|
| 34 |
+
beta1 = 0.9
|
| 35 |
+
beta2 = 0.95
|
| 36 |
+
grad_clip = 1.0
|
| 37 |
+
|
| 38 |
+
decay_lr = False
|
| 39 |
+
warmup_iters = 2000
|
| 40 |
+
lr_decay_iters = 600000
|
| 41 |
+
min_lr = 6e-5
|
| 42 |
+
|
| 43 |
+
backend = 'nccl'
|
| 44 |
+
device = 'cuda'
|
| 45 |
+
dtype = 'float16'
|
| 46 |
+
compile = True
|
config/train_reflow_1_small_sp.py
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
model_config = 'reflow'
|
| 3 |
+
|
| 4 |
+
log_file = 'logs/reflow-1-small-sp.log'
|
| 5 |
+
|
| 6 |
+
out_dir = 'out/reflow-1-small-sp'
|
| 7 |
+
eval_interval = 500
|
| 8 |
+
log_interval = 1
|
| 9 |
+
eval_iters = 500
|
| 10 |
+
eval_only = False
|
| 11 |
+
always_save_checkpoint = False
|
| 12 |
+
init_from = 'scratch'
|
| 13 |
+
|
| 14 |
+
wandb_log = False
|
| 15 |
+
wandb_project = 'reflow-1-small-sp'
|
| 16 |
+
wandb_run_name = 'reflow-1-small-sp'
|
| 17 |
+
|
| 18 |
+
dataset = 'openwebtext'
|
| 19 |
+
gradient_accumulation_steps = 64
|
| 20 |
+
batch_size = 1
|
| 21 |
+
block_size = 1024
|
| 22 |
+
|
| 23 |
+
n_layer = 12
|
| 24 |
+
n_head = 6
|
| 25 |
+
n_embd = 384
|
| 26 |
+
n_signals = 384
|
| 27 |
+
vocab_size = 50304
|
| 28 |
+
dropout = 0.0
|
| 29 |
+
bias = False
|
| 30 |
+
|
| 31 |
+
learning_rate = 1.5e-4
|
| 32 |
+
max_iters = 50000
|
| 33 |
+
weight_decay = 0.1
|
| 34 |
+
beta1 = 0.9
|
| 35 |
+
beta2 = 0.95
|
| 36 |
+
grad_clip = 1.0
|
| 37 |
+
|
| 38 |
+
decay_lr = False
|
| 39 |
+
warmup_iters = 2000
|
| 40 |
+
lr_decay_iters = 600000
|
| 41 |
+
min_lr = 6e-5
|
| 42 |
+
|
| 43 |
+
backend = 'nccl'
|
| 44 |
+
device = 'cuda'
|
| 45 |
+
dtype = 'float16'
|
| 46 |
+
compile = True
|
config/train_reflow_1_small_sp_resume.py
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
model_config = 'reflow'
|
| 3 |
+
|
| 4 |
+
log_file = 'logs/reflow-1-small-sp.log'
|
| 5 |
+
|
| 6 |
+
out_dir = 'out/reflow-1-small-sp'
|
| 7 |
+
eval_interval = 500
|
| 8 |
+
log_interval = 1
|
| 9 |
+
eval_iters = 500
|
| 10 |
+
eval_only = False
|
| 11 |
+
always_save_checkpoint = False
|
| 12 |
+
init_from = 'resume'
|
| 13 |
+
|
| 14 |
+
wandb_log = False
|
| 15 |
+
wandb_project = 'reflow-1-small-sp'
|
| 16 |
+
wandb_run_name = 'reflow-1-small-sp'
|
| 17 |
+
|
| 18 |
+
dataset = 'openwebtext'
|
| 19 |
+
gradient_accumulation_steps = 64
|
| 20 |
+
batch_size = 1
|
| 21 |
+
block_size = 1024
|
| 22 |
+
|
| 23 |
+
n_layer = 12
|
| 24 |
+
n_head = 6
|
| 25 |
+
n_embd = 384
|
| 26 |
+
n_signals = 384
|
| 27 |
+
vocab_size = 50304
|
| 28 |
+
dropout = 0.0
|
| 29 |
+
bias = False
|
| 30 |
+
|
| 31 |
+
learning_rate = 1.5e-4
|
| 32 |
+
max_iters = 50000
|
| 33 |
+
weight_decay = 0.1
|
| 34 |
+
beta1 = 0.9
|
| 35 |
+
beta2 = 0.95
|
| 36 |
+
grad_clip = 1.0
|
| 37 |
+
|
| 38 |
+
decay_lr = False
|
| 39 |
+
warmup_iters = 2000
|
| 40 |
+
lr_decay_iters = 600000
|
| 41 |
+
min_lr = 6e-5
|
| 42 |
+
|
| 43 |
+
backend = 'nccl'
|
| 44 |
+
device = 'cuda'
|
| 45 |
+
dtype = 'float16'
|
| 46 |
+
compile = True
|
config/train_reflow_1_topk.py
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
model_config = 'reflow-topk'
|
| 3 |
+
|
| 4 |
+
log_file = 'logs/reflow-1-topk.log'
|
| 5 |
+
|
| 6 |
+
out_dir = 'out/reflow-1-topk'
|
| 7 |
+
eval_interval = 500
|
| 8 |
+
log_interval = 1
|
| 9 |
+
eval_iters = 500
|
| 10 |
+
eval_only = False
|
| 11 |
+
always_save_checkpoint = False
|
| 12 |
+
init_from = 'scratch'
|
| 13 |
+
|
| 14 |
+
wandb_log = False
|
| 15 |
+
wandb_project = 'reflow-1-topk'
|
| 16 |
+
wandb_run_name = 'reflow-1-topk'
|
| 17 |
+
|
| 18 |
+
dataset = 'openwebtext'
|
| 19 |
+
gradient_accumulation_steps = 64
|
| 20 |
+
batch_size = 1
|
| 21 |
+
block_size = 1024
|
| 22 |
+
|
| 23 |
+
n_layer = 6
|
| 24 |
+
n_head = 8
|
| 25 |
+
n_embd = 512
|
| 26 |
+
n_signals = 512
|
| 27 |
+
vocab_size = 50304
|
| 28 |
+
dropout = 0.0
|
| 29 |
+
bias = False
|
| 30 |
+
max_active_signals = 32
|
| 31 |
+
sparsity_penalty = 1e-3
|
| 32 |
+
|
| 33 |
+
learning_rate = 3e-4
|
| 34 |
+
max_iters = 50000
|
| 35 |
+
weight_decay = 1e-1
|
| 36 |
+
beta1 = 0.9
|
| 37 |
+
beta2 = 0.95
|
| 38 |
+
grad_clip = 1.0
|
| 39 |
+
|
| 40 |
+
decay_lr = True
|
| 41 |
+
warmup_iters = 1000
|
| 42 |
+
lr_decay_iters = 50000
|
| 43 |
+
min_lr = 3e-5
|
| 44 |
+
|
| 45 |
+
backend = 'nccl'
|
| 46 |
+
device = 'cuda'
|
| 47 |
+
dtype = 'float16'
|
| 48 |
+
compile = True
|
config/train_reflow_1_topk_big.py
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
model_config = 'reflow-topk'
|
| 3 |
+
|
| 4 |
+
log_file = 'logs/reflow-1-topk-big.log'
|
| 5 |
+
|
| 6 |
+
out_dir = 'out/reflow-1-topk-big'
|
| 7 |
+
eval_interval = 500
|
| 8 |
+
log_interval = 1
|
| 9 |
+
eval_iters = 500
|
| 10 |
+
eval_only = False
|
| 11 |
+
always_save_checkpoint = False
|
| 12 |
+
init_from = 'scratch'
|
| 13 |
+
|
| 14 |
+
wandb_log = False
|
| 15 |
+
wandb_project = 'reflow-1-topk-big'
|
| 16 |
+
wandb_run_name = 'reflow-1-topk-big'
|
| 17 |
+
|
| 18 |
+
dataset = 'openwebtext'
|
| 19 |
+
gradient_accumulation_steps = 64
|
| 20 |
+
batch_size = 1
|
| 21 |
+
block_size = 1024
|
| 22 |
+
|
| 23 |
+
n_layer = 36
|
| 24 |
+
n_head = 16
|
| 25 |
+
n_embd = 1024
|
| 26 |
+
n_signals = 1024
|
| 27 |
+
vocab_size = 50304
|
| 28 |
+
dropout = 0.0
|
| 29 |
+
bias = False
|
| 30 |
+
max_active_signals = 64
|
| 31 |
+
sparsity_penalty = 1e-3
|
| 32 |
+
|
| 33 |
+
learning_rate = 3e-4
|
| 34 |
+
max_iters = 50000
|
| 35 |
+
weight_decay = 1e-1
|
| 36 |
+
beta1 = 0.9
|
| 37 |
+
beta2 = 0.95
|
| 38 |
+
grad_clip = 1.0
|
| 39 |
+
|
| 40 |
+
decay_lr = True
|
| 41 |
+
warmup_iters = 1000
|
| 42 |
+
lr_decay_iters = 50000
|
| 43 |
+
min_lr = 3e-5
|
| 44 |
+
|
| 45 |
+
backend = 'nccl'
|
| 46 |
+
device = 'cuda'
|
| 47 |
+
dtype = 'float16'
|
| 48 |
+
compile = True
|
config/train_reflow_1_topk_big_resume.py
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
model_config = 'reflow-topk'
|
| 3 |
+
|
| 4 |
+
log_file = 'logs/reflow-1-topk-big.log'
|
| 5 |
+
|
| 6 |
+
out_dir = 'out/reflow-1-topk-big'
|
| 7 |
+
eval_interval = 500
|
| 8 |
+
log_interval = 1
|
| 9 |
+
eval_iters = 500
|
| 10 |
+
eval_only = False
|
| 11 |
+
always_save_checkpoint = False
|
| 12 |
+
init_from = 'resume'
|
| 13 |
+
|
| 14 |
+
wandb_log = False
|
| 15 |
+
wandb_project = 'reflow-1-topk-big'
|
| 16 |
+
wandb_run_name = 'reflow-1-topk-big'
|
| 17 |
+
|
| 18 |
+
dataset = 'openwebtext'
|
| 19 |
+
gradient_accumulation_steps = 64
|
| 20 |
+
batch_size = 1
|
| 21 |
+
block_size = 1024
|
| 22 |
+
|
| 23 |
+
n_layer = 36
|
| 24 |
+
n_head = 16
|
| 25 |
+
n_embd = 1024
|
| 26 |
+
n_signals = 1024
|
| 27 |
+
vocab_size = 50304
|
| 28 |
+
dropout = 0.0
|
| 29 |
+
bias = False
|
| 30 |
+
max_active_signals = 64
|
| 31 |
+
sparsity_penalty = 1e-3
|
| 32 |
+
|
| 33 |
+
learning_rate = 3e-4
|
| 34 |
+
max_iters = 50000
|
| 35 |
+
weight_decay = 1e-1
|
| 36 |
+
beta1 = 0.9
|
| 37 |
+
beta2 = 0.95
|
| 38 |
+
grad_clip = 1.0
|
| 39 |
+
|
| 40 |
+
decay_lr = True
|
| 41 |
+
warmup_iters = 1000
|
| 42 |
+
lr_decay_iters = 50000
|
| 43 |
+
min_lr = 3e-5
|
| 44 |
+
|
| 45 |
+
backend = 'nccl'
|
| 46 |
+
device = 'cuda'
|
| 47 |
+
dtype = 'float16'
|
| 48 |
+
compile = True
|
config/train_reflow_base.py
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
model_config = 'reflow'
|
| 3 |
+
|
| 4 |
+
out_dir = 'out-reflow'
|
| 5 |
+
eval_interval = 250
|
| 6 |
+
log_interval = 10
|
| 7 |
+
eval_iters = 200
|
| 8 |
+
eval_only = False
|
| 9 |
+
always_save_checkpoint = False
|
| 10 |
+
init_from = 'scratch'
|
| 11 |
+
|
| 12 |
+
wandb_log = False
|
| 13 |
+
wandb_project = 'reflow'
|
| 14 |
+
wandb_run_name = 'reflow'
|
| 15 |
+
|
| 16 |
+
dataset = 'openwebtext'
|
| 17 |
+
gradient_accumulation_steps = 64
|
| 18 |
+
batch_size = 1
|
| 19 |
+
block_size = 1024
|
| 20 |
+
|
| 21 |
+
n_layer = 32
|
| 22 |
+
n_head = 16
|
| 23 |
+
n_embd = 1024
|
| 24 |
+
n_signals = 1024
|
| 25 |
+
dropout = 0.0
|
| 26 |
+
bias = False
|
| 27 |
+
|
| 28 |
+
learning_rate = 1.5e-4
|
| 29 |
+
max_iters = 50000
|
| 30 |
+
weight_decay = 0.1
|
| 31 |
+
beta1 = 0.9
|
| 32 |
+
beta2 = 0.95
|
| 33 |
+
grad_clip = 1.0
|
| 34 |
+
|
| 35 |
+
decay_lr = False
|
| 36 |
+
warmup_iters = 2000
|
| 37 |
+
lr_decay_iters = 600000
|
| 38 |
+
min_lr = 6e-5
|
| 39 |
+
|
| 40 |
+
backend = 'nccl'
|
| 41 |
+
device = 'cuda'
|
| 42 |
+
dtype = 'float16'
|
| 43 |
+
compile = True
|
config/train_resume.py
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
import time
|
| 3 |
+
|
| 4 |
+
model_config = 'gpt2'
|
| 5 |
+
|
| 6 |
+
out_dir = 'out-web'
|
| 7 |
+
eval_interval = 50
|
| 8 |
+
log_interval = 1
|
| 9 |
+
eval_iters = 40
|
| 10 |
+
eval_only = False
|
| 11 |
+
always_save_checkpoint = False
|
| 12 |
+
|
| 13 |
+
wandb_log = True
|
| 14 |
+
wandb_project = 'resume'
|
| 15 |
+
wandb_run_name = 'resume-' + str(time.time())
|
| 16 |
+
|
| 17 |
+
dataset = 'openwebtext'
|
| 18 |
+
gradient_accumulation_steps = 64
|
| 19 |
+
batch_size = 1
|
| 20 |
+
block_size = 1024
|
| 21 |
+
|
| 22 |
+
n_layer = 12
|
| 23 |
+
n_head = 12
|
| 24 |
+
n_embd = 768
|
| 25 |
+
dropout = 0.0
|
| 26 |
+
bias = False
|
| 27 |
+
|
| 28 |
+
learning_rate = 1e-6
|
| 29 |
+
max_iters = 20
|
| 30 |
+
weight_decay = 0.1
|
| 31 |
+
beta1 = 0.9
|
| 32 |
+
beta2 = 0.95
|
| 33 |
+
grad_clip = 1.0
|
| 34 |
+
|
| 35 |
+
decay_lr = True
|
| 36 |
+
warmup_iters = 0
|
| 37 |
+
lr_decay_iters = 20
|
| 38 |
+
min_lr = 1e-6
|
| 39 |
+
|
| 40 |
+
backend = 'nccl'
|
| 41 |
+
device = 'cuda'
|
| 42 |
+
dtype = 'bfloat16'
|
| 43 |
+
compile = True
|
| 44 |
+
init_from = 'resume'
|
config/train_sft.py
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
model_config = 'gpt2'
|
| 3 |
+
|
| 4 |
+
out_dir = 'out-sft'
|
| 5 |
+
base_model_dir = 'out'
|
| 6 |
+
init_from = 'finetune'
|
| 7 |
+
|
| 8 |
+
eval_interval = 250
|
| 9 |
+
log_interval = 1
|
| 10 |
+
eval_iters = 40
|
| 11 |
+
eval_only = False
|
| 12 |
+
always_save_checkpoint = True
|
| 13 |
+
|
| 14 |
+
dataset = 'alpaca'
|
| 15 |
+
gradient_accumulation_steps = 4
|
| 16 |
+
batch_size = 4
|
| 17 |
+
block_size = 1024
|
| 18 |
+
|
| 19 |
+
n_layer = 12
|
| 20 |
+
n_head = 12
|
| 21 |
+
n_embd = 768
|
| 22 |
+
dropout = 0.1
|
| 23 |
+
bias = False
|
| 24 |
+
|
| 25 |
+
learning_rate = 2e-5
|
| 26 |
+
max_iters = 3000
|
| 27 |
+
weight_decay = 0.1
|
| 28 |
+
beta1 = 0.9
|
| 29 |
+
beta2 = 0.95
|
| 30 |
+
grad_clip = 1.0
|
| 31 |
+
|
| 32 |
+
decay_lr = True
|
| 33 |
+
warmup_iters = 100
|
| 34 |
+
lr_decay_iters = 3000
|
| 35 |
+
min_lr = 1e-6
|
| 36 |
+
|
| 37 |
+
sft_masking = True
|
| 38 |
+
|
| 39 |
+
backend = 'nccl'
|
| 40 |
+
device = 'cuda'
|
| 41 |
+
dtype = 'bfloat16'
|
| 42 |
+
compile = True
|
config/train_shakespeare_char.py
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
model_config = 'gpt2'
|
| 3 |
+
|
| 4 |
+
out_dir = 'out-shakespeare-char'
|
| 5 |
+
eval_interval = 250
|
| 6 |
+
log_interval = 10
|
| 7 |
+
eval_iters = 200
|
| 8 |
+
eval_only = False
|
| 9 |
+
always_save_checkpoint = False
|
| 10 |
+
|
| 11 |
+
wandb_log = False
|
| 12 |
+
wandb_project = 'shakespeare-char'
|
| 13 |
+
wandb_run_name = 'mini-gpt'
|
| 14 |
+
|
| 15 |
+
dataset = 'shakespeare_char'
|
| 16 |
+
gradient_accumulation_steps = 1
|
| 17 |
+
batch_size = 64
|
| 18 |
+
block_size = 256
|
| 19 |
+
|
| 20 |
+
n_layer = 6
|
| 21 |
+
n_head = 6
|
| 22 |
+
n_embd = 384
|
| 23 |
+
dropout = 0.2
|
| 24 |
+
bias = False
|
| 25 |
+
|
| 26 |
+
learning_rate = 1e-3
|
| 27 |
+
max_iters = 5000
|
| 28 |
+
weight_decay = 1e-1
|
| 29 |
+
beta1 = 0.9
|
| 30 |
+
beta2 = 0.99
|
| 31 |
+
grad_clip = 1.0
|
| 32 |
+
|
| 33 |
+
decay_lr = True
|
| 34 |
+
warmup_iters = 100
|
| 35 |
+
lr_decay_iters = 5000
|
| 36 |
+
min_lr = 1e-4
|
| 37 |
+
|
| 38 |
+
backend = 'nccl'
|
| 39 |
+
device = 'cuda'
|
| 40 |
+
dtype = 'bfloat16'
|
| 41 |
+
compile = True
|
| 42 |
+
init_from = 'scratch'
|