Upload folder using huggingface_hub
Browse filesThis view is limited to 50 files because it contains too many changes. See raw diff
- .gitattributes +14 -35
- .gitignore +11 -0
- LICENSE +21 -0
- README.md +172 -0
- README_CN.md +199 -0
- bench.py +182 -0
- check.py +122 -0
- config/base.py +41 -0
- config/bench_base.py +14 -0
- config/bench_gpt2.py +18 -0
- config/eval_gpt2.py +5 -0
- config/eval_gpt2_large.py +5 -0
- config/eval_gpt2_medium.py +5 -0
- config/eval_gpt2_xl.py +5 -0
- config/finetune_shakespeare.py +45 -0
- config/sample_base.py +12 -0
- config/sample_gpt2.py +17 -0
- config/sample_gpt2_new.py +17 -0
- config/sample_gpt2_new_nolr.py +17 -0
- config/sample_reflow_1.py +17 -0
- config/sample_reflow_1_big.py +17 -0
- config/sample_reflow_1_lite.py +17 -0
- config/sample_reflow_1_small.py +17 -0
- config/sample_reflow_1_small_sp.py +17 -0
- config/sample_reflow_1_topk.py +17 -0
- config/sample_reflow_1_topk_big.py +17 -0
- config/sample_sft_reflow_1.py +17 -0
- config/train_gpt2.py +45 -0
- config/train_gpt2_new.py +45 -0
- config/train_gpt2_new_nolr.py +45 -0
- config/train_gpt2_new_nolr_resume.py +45 -0
- config/train_gpt2_new_resume.py +45 -0
- config/train_gpt2_resume.py +45 -0
- config/train_reflow_1.py +46 -0
- config/train_reflow_1_big.py +46 -0
- config/train_reflow_1_big_resume.py +46 -0
- config/train_reflow_1_lite.py +49 -0
- config/train_reflow_1_lite_resume.py +49 -0
- config/train_reflow_1_resume.py +46 -0
- config/train_reflow_1_small.py +46 -0
- config/train_reflow_1_small_resume.py +46 -0
- config/train_reflow_1_small_sp.py +46 -0
- config/train_reflow_1_small_sp_resume.py +46 -0
- config/train_reflow_1_topk.py +48 -0
- config/train_reflow_1_topk_big.py +48 -0
- config/train_reflow_1_topk_big_resume.py +48 -0
- config/train_reflow_base.py +43 -0
- config/train_resume.py +44 -0
- config/train_sft.py +42 -0
- config/train_shakespeare_char.py +42 -0
.gitattributes
CHANGED
|
@@ -1,35 +1,14 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
*.
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
-
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
-
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
-
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
-
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
-
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
-
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
-
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
-
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
-
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
-
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
-
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
-
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
-
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
-
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
-
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
-
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
-
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
-
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
-
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
-
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
| 1 |
+
# Override jupyter in Github language stats for more accurate estimate of repo code languages
|
| 2 |
+
# reference: https://github.com/github/linguist/blob/master/docs/overrides.md#generated-code
|
| 3 |
+
*.ipynb linguist-generated
|
| 4 |
+
out/gpt2/ckpt.pt filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
out/gpt2-new/ckpt.pt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
out/reflow-1/ckpt.pt filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
out/reflow-1-big/ckpt.pt filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
out/reflow-1-lite/ckpt.pt filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
out/reflow-1-small/ckpt.pt filter=lfs diff=lfs merge=lfs -text
|
| 10 |
+
out/reflow-1-small-sp/ckpt.pt filter=lfs diff=lfs merge=lfs -text
|
| 11 |
+
out/reflow-1-topk/ckpt.pt filter=lfs diff=lfs merge=lfs -text
|
| 12 |
+
out/reflow-1-topk-big/ckpt.pt filter=lfs diff=lfs merge=lfs -text
|
| 13 |
+
paper/paper-cn.pdf filter=lfs diff=lfs merge=lfs -text
|
| 14 |
+
paper/paper.pdf filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
.gitignore
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
.DS_Store
|
| 2 |
+
.idea
|
| 3 |
+
.ipynb_checkpoints/
|
| 4 |
+
.vscode
|
| 5 |
+
__pycache__/
|
| 6 |
+
*.bin
|
| 7 |
+
*.pkl
|
| 8 |
+
*.pyc
|
| 9 |
+
input.txt
|
| 10 |
+
env/
|
| 11 |
+
venv/
|
LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
MIT License
|
| 2 |
+
|
| 3 |
+
Copyright (c) 2022 Andrej Karpathy
|
| 4 |
+
|
| 5 |
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
| 6 |
+
of this software and associated documentation files (the "Software"), to deal
|
| 7 |
+
in the Software without restriction, including without limitation the rights
|
| 8 |
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
| 9 |
+
copies of the Software, and to permit persons to whom the Software is
|
| 10 |
+
furnished to do so, subject to the following conditions:
|
| 11 |
+
|
| 12 |
+
The above copyright notice and this permission notice shall be included in all
|
| 13 |
+
copies or substantial portions of the Software.
|
| 14 |
+
|
| 15 |
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
| 16 |
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
| 17 |
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
| 18 |
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
| 19 |
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
| 20 |
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
| 21 |
+
SOFTWARE.
|
README.md
ADDED
|
@@ -0,0 +1,172 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
license: mit
|
| 3 |
+
language:
|
| 4 |
+
- en
|
| 5 |
+
- zh
|
| 6 |
+
tags:
|
| 7 |
+
- transformer
|
| 8 |
+
- interpretability
|
| 9 |
+
- mechanistic-interpretability
|
| 10 |
+
- language-model
|
| 11 |
+
- signal-decomposition
|
| 12 |
+
- sparse-representations
|
| 13 |
+
- pytorch
|
| 14 |
+
datasets:
|
| 15 |
+
- openwebtext
|
| 16 |
+
pipeline_tag: text-generation
|
| 17 |
+
---
|
| 18 |
+
|
| 19 |
+
# reFlow
|
| 20 |
+
|
| 21 |
+
**A Metal Soul In My Hand** — A feature-decoupled Transformer architecture with native interpretability.
|
| 22 |
+
|
| 23 |
+
reFlow reconstructs the traditional full-rank embedding matrix into the product of a **Recipe Matrix** $W_{recipe} \in \mathbb{R}^{V \times S}$ and a **Signal Basis Matrix** $W_{basis} \in \mathbb{R}^{S \times d}$, forcing the model to maintain a set of continuous, low-redundancy signal bases in latent space. A dynamic vocabulary matrix $W_{vocab} = W_{recipe} \times W_{basis}$ is reconstructed in real-time at each forward pass, serving simultaneously as both the embedding matrix and the output projection matrix.
|
| 24 |
+
|
| 25 |
+
> **Paper**: [English (PDF)](./paper/paper.pdf) | [中文 (PDF)](./paper/paper-cn.pdf)
|
| 26 |
+
|
| 27 |
+
## Project Structure
|
| 28 |
+
|
| 29 |
+
```
|
| 30 |
+
reFlow/
|
| 31 |
+
├── train.py # Training script (single GPU / DDP)
|
| 32 |
+
├── sample.py # Text generation from trained models
|
| 33 |
+
├── experiment.py # 12-experiment interpretability suite (Chinese)
|
| 34 |
+
├── experiment_en.py # 12-experiment interpretability suite (English)
|
| 35 |
+
├── check.py # Checkpoint parameter inspector
|
| 36 |
+
├── bench.py # Performance benchmarking
|
| 37 |
+
├── models/
|
| 38 |
+
│ ├── gpt2.py # Standard GPT-2 baseline
|
| 39 |
+
│ ├── gpt2-new.py # Modernized GPT-2 (RoPE + SwiGLU + RMSNorm)
|
| 40 |
+
│ ├── reflow.py # reFlow base architecture
|
| 41 |
+
│ ├── reflow-topk.py # reFlow with ReLU + Top-K hard sparsity
|
| 42 |
+
│ └── reflow-lite.py # reFlow with GQA + reduced MLP
|
| 43 |
+
├── config/ # Training / sampling / eval configurations
|
| 44 |
+
├── data/
|
| 45 |
+
│ ├── openwebtext/ # OpenWebText dataset preparation
|
| 46 |
+
│ └── sft-lima/ # LIMA SFT dataset preparation
|
| 47 |
+
└── out/ # Checkpoints and experiment reports
|
| 48 |
+
```
|
| 49 |
+
|
| 50 |
+
## Installation
|
| 51 |
+
|
| 52 |
+
### Prerequisites
|
| 53 |
+
|
| 54 |
+
- Python 3.10+
|
| 55 |
+
- CUDA-compatible GPU (tested on Tesla T4 x4)
|
| 56 |
+
|
| 57 |
+
### 1. PyTorch (CUDA 12.8)
|
| 58 |
+
|
| 59 |
+
```bash
|
| 60 |
+
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu128
|
| 61 |
+
```
|
| 62 |
+
|
| 63 |
+
> Adjust the CUDA version in the URL to match your driver. See [PyTorch Get Started](https://pytorch.org/get-started/locally/).
|
| 64 |
+
|
| 65 |
+
### 2. Core Dependencies
|
| 66 |
+
|
| 67 |
+
```bash
|
| 68 |
+
pip install datasets tiktoken wandb tqdm
|
| 69 |
+
```
|
| 70 |
+
|
| 71 |
+
### 3. Experiment Suite Dependencies
|
| 72 |
+
|
| 73 |
+
The interpretability experiments (`experiment.py`) require additional packages:
|
| 74 |
+
|
| 75 |
+
```bash
|
| 76 |
+
pip install numpy matplotlib seaborn scikit-learn scipy adjustText
|
| 77 |
+
```
|
| 78 |
+
|
| 79 |
+
### Quick Install (All-in-One)
|
| 80 |
+
|
| 81 |
+
```bash
|
| 82 |
+
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu128
|
| 83 |
+
pip install datasets tiktoken wandb tqdm numpy matplotlib seaborn scikit-learn scipy adjustText
|
| 84 |
+
```
|
| 85 |
+
|
| 86 |
+
## Data Preparation
|
| 87 |
+
|
| 88 |
+
### OpenWebText
|
| 89 |
+
|
| 90 |
+
```bash
|
| 91 |
+
python data/openwebtext/prepare.py
|
| 92 |
+
```
|
| 93 |
+
|
| 94 |
+
This downloads the OpenWebText corpus (~54 GB) and tokenizes it with the GPT-2 BPE tokenizer. Output: `data/openwebtext/train.bin` (~17 GB, ~9B tokens) and `val.bin`.
|
| 95 |
+
|
| 96 |
+
## Training
|
| 97 |
+
|
| 98 |
+
All configurations are in `config/`. No CLI overrides — all hyperparameters must be set in the config file.
|
| 99 |
+
|
| 100 |
+
### Single GPU
|
| 101 |
+
|
| 102 |
+
```bash
|
| 103 |
+
python train.py config/train_reflow_1.py
|
| 104 |
+
```
|
| 105 |
+
|
| 106 |
+
### Multi-GPU (DDP)
|
| 107 |
+
|
| 108 |
+
```bash
|
| 109 |
+
torchrun --standalone --nproc_per_node=4 train.py config/train_reflow_1.py
|
| 110 |
+
```
|
| 111 |
+
|
| 112 |
+
### Available Training Configs
|
| 113 |
+
|
| 114 |
+
| Config | Architecture | Layers | Params | Notes |
|
| 115 |
+
|--------|-------------|--------|--------|-------|
|
| 116 |
+
| `train_gpt2.py` | GPT-2 | 36 | 505.62M | Standard baseline |
|
| 117 |
+
| `train_gpt2_new.py` | GPT-2-New | 36 | 514.01M | + RoPE, SwiGLU, RMSNorm |
|
| 118 |
+
| `train_reflow_1.py` | reFlow | 32 | 463.67M | Base reFlow, constant lr |
|
| 119 |
+
| `train_reflow_1_big.py` | reFlow | 36 | 515.06M | lr decay, for interpretability |
|
| 120 |
+
| `train_reflow_1_topk_big.py` | reFlow-TopK | 36 | 515.06M | + ReLU + Top-64 sparsity |
|
| 121 |
+
| `train_reflow_1_lite.py` | reFlow-Lite | 32 | 413.34M | + GQA, reduced MLP |
|
| 122 |
+
| `train_reflow_1_small.py` | reFlow | 6 | 46.47M | Small-scale validation |
|
| 123 |
+
|
| 124 |
+
### Resume Training
|
| 125 |
+
|
| 126 |
+
Append `_resume` to the config name (e.g., `train_reflow_1_big_resume.py`).
|
| 127 |
+
|
| 128 |
+
## Text Generation
|
| 129 |
+
|
| 130 |
+
```bash
|
| 131 |
+
python sample.py config/sample_reflow_1.py
|
| 132 |
+
```
|
| 133 |
+
|
| 134 |
+
Edit the config file to change the prompt, temperature, top-k, etc.
|
| 135 |
+
|
| 136 |
+
## Interpretability Experiments
|
| 137 |
+
|
| 138 |
+
The experiment suite runs 12 analyses on a trained reFlow model. Both Chinese and English versions are available:
|
| 139 |
+
|
| 140 |
+
```bash
|
| 141 |
+
python experiment_en.py config/train_reflow_1_big.py # English
|
| 142 |
+
python experiment.py config/train_reflow_1_big.py # Chinese
|
| 143 |
+
```
|
| 144 |
+
|
| 145 |
+
An interactive menu will appear:
|
| 146 |
+
|
| 147 |
+
| # | Experiment | Group |
|
| 148 |
+
|---|-----------|-------|
|
| 149 |
+
| 1 | Recipe Atlas — recipe-space nearest neighbors | A. Signal Identity |
|
| 150 |
+
| 2 | Sparsity Profile — activation sparsity analysis | A. Signal Identity |
|
| 151 |
+
| 3 | Basis Geometry — singular value & effective rank | A. Signal Identity |
|
| 152 |
+
| 4 | Semantic Galaxy — PCA clustering visualization | B. Semantic Properties |
|
| 153 |
+
| 5 | Semantic Algebra — vector arithmetic (king − man + woman = queen) | B. Semantic Properties |
|
| 154 |
+
| 6 | Typo Resilience — robustness to spelling errors | B. Semantic Properties |
|
| 155 |
+
| 7 | Layer Evolution — per-layer probability crystallization | C. Mechanistic Analysis |
|
| 156 |
+
| 8 | Signal Flow — signal activation heatmaps across layers | C. Mechanistic Analysis |
|
| 157 |
+
| 9 | Causal Ablation — progressive signal knockout curves | C. Mechanistic Analysis |
|
| 158 |
+
| 10 | Emotion Surgery — sentiment steering via signal injection | D. Control & Steering |
|
| 159 |
+
| 11 | Concept Inception — binary-search concept implantation | D. Control & Steering |
|
| 160 |
+
| 12 | Genetic Hijack — global recipe matrix manipulation | D. Control & Steering |
|
| 161 |
+
|
| 162 |
+
Enter `all` to run all experiments, or specific numbers (e.g., `1 3 5`). Reports are saved to `out/<model>/audit_reports/`.
|
| 163 |
+
|
| 164 |
+
## Checkpoint Inspection
|
| 165 |
+
|
| 166 |
+
```bash
|
| 167 |
+
python check.py config/train_reflow_1.py out/reflow-1/ckpt.pt
|
| 168 |
+
```
|
| 169 |
+
|
| 170 |
+
## License
|
| 171 |
+
|
| 172 |
+
MIT License. Based on [nanoGPT](https://github.com/karpathy/nanoGPT) by Andrej Karpathy.
|
README_CN.md
ADDED
|
@@ -0,0 +1,199 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# reFlow
|
| 2 |
+
|
| 3 |
+
**A Metal Soul In My Hand** — 具备原生可解释性的特征解耦 Transformer 架构。
|
| 4 |
+
|
| 5 |
+
reFlow 将传统全秩词嵌入矩阵重构为**配方矩阵** $W_{recipe} \in \mathbb{R}^{V \times S}$ 与**信号基底矩阵** $W_{basis} \in \mathbb{R}^{S \times d}$ 的乘积形式,迫使模型在潜空间中维护一组连续、低冗余的信号基底。动态词表矩阵 $W_{vocab} = W_{recipe} \times W_{basis}$ 在每次前向传播中实时重构,同时作为嵌入矩阵与输出投影矩阵使用。
|
| 6 |
+
|
| 7 |
+
> **论文**: [English (PDF)](./paper/paper.pdf) | [中文 (PDF)](./paper/paper-cn.pdf)
|
| 8 |
+
|
| 9 |
+
## 项目结构
|
| 10 |
+
|
| 11 |
+
```
|
| 12 |
+
reFlow/
|
| 13 |
+
├── train.py # 训练脚本(单卡 / DDP 多卡)
|
| 14 |
+
├── sample.py # 从训练好的模型生成文本
|
| 15 |
+
├── experiment.py # 12 项可解释性实验套件(中文)
|
| 16 |
+
├── experiment_en.py # 12 项可解释性实验套件(English)
|
| 17 |
+
├── check.py # Checkpoint 参数检查工具
|
| 18 |
+
├── bench.py # 性能基准测试
|
| 19 |
+
├── models/
|
| 20 |
+
│ ├── gpt2.py # 标准 GPT-2 基线
|
| 21 |
+
│ ├── gpt2-new.py # 现代化 GPT-2(RoPE + SwiGLU + RMSNorm)
|
| 22 |
+
│ ├── reflow.py # reFlow 基础架构
|
| 23 |
+
│ ├── reflow-topk.py # reFlow + ReLU + Top-K 硬稀疏变体
|
| 24 |
+
│ └── reflow-lite.py # reFlow + GQA + 缩减 MLP 轻量变体
|
| 25 |
+
├── config/ # 训练 / 采样 / 评估配置文件
|
| 26 |
+
├── data/
|
| 27 |
+
│ ├── openwebtext/ # OpenWebText 数据集预处理
|
| 28 |
+
│ └── sft-lima/ # LIMA SFT 数据集预处理
|
| 29 |
+
└── out/ # Checkpoints 与实验报告输出
|
| 30 |
+
```
|
| 31 |
+
|
| 32 |
+
## 安装
|
| 33 |
+
|
| 34 |
+
### 环境要求
|
| 35 |
+
|
| 36 |
+
- Python 3.10+
|
| 37 |
+
- 支持 CUDA 的 GPU(实验环境:Tesla T4 x4)
|
| 38 |
+
|
| 39 |
+
### 1. 安装 PyTorch(CUDA 12.8)
|
| 40 |
+
|
| 41 |
+
```bash
|
| 42 |
+
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu128
|
| 43 |
+
```
|
| 44 |
+
|
| 45 |
+
> 请根据你的 CUDA 驱动版本调整 URL 中的版本号。详见 [PyTorch 官方安装指南](https://pytorch.org/get-started/locally/)。
|
| 46 |
+
|
| 47 |
+
### 2. 核心依赖
|
| 48 |
+
|
| 49 |
+
```bash
|
| 50 |
+
pip install datasets tiktoken wandb tqdm
|
| 51 |
+
```
|
| 52 |
+
|
| 53 |
+
### 3. 可解释性实验依赖
|
| 54 |
+
|
| 55 |
+
运行 `experiment.py` 需要额外安装以下包:
|
| 56 |
+
|
| 57 |
+
```bash
|
| 58 |
+
pip install numpy matplotlib seaborn scikit-learn scipy adjustText
|
| 59 |
+
```
|
| 60 |
+
|
| 61 |
+
### 一键安装(全部依赖)
|
| 62 |
+
|
| 63 |
+
```bash
|
| 64 |
+
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu128
|
| 65 |
+
pip install datasets tiktoken wandb tqdm numpy matplotlib seaborn scikit-learn scipy adjustText
|
| 66 |
+
```
|
| 67 |
+
|
| 68 |
+
## 数据准备
|
| 69 |
+
|
| 70 |
+
### OpenWebText
|
| 71 |
+
|
| 72 |
+
```bash
|
| 73 |
+
python data/openwebtext/prepare.py
|
| 74 |
+
```
|
| 75 |
+
|
| 76 |
+
该脚本会下载 OpenWebText 语料库(约 54 GB),使用 GPT-2 BPE 分词器进行编码。输出:`data/openwebtext/train.bin`(约 17 GB,约 90 亿 tokens)和 `val.bin`。
|
| 77 |
+
|
| 78 |
+
## 训练
|
| 79 |
+
|
| 80 |
+
所有超参数均在 `config/` 目录下的配置文件中指定,不支持命令行覆盖。
|
| 81 |
+
|
| 82 |
+
### 单卡训练
|
| 83 |
+
|
| 84 |
+
```bash
|
| 85 |
+
python train.py config/train_reflow_1.py
|
| 86 |
+
```
|
| 87 |
+
|
| 88 |
+
### 多卡训练(DDP)
|
| 89 |
+
|
| 90 |
+
```bash
|
| 91 |
+
torchrun --standalone --nproc_per_node=4 train.py config/train_reflow_1.py
|
| 92 |
+
```
|
| 93 |
+
|
| 94 |
+
### 可用训练配置
|
| 95 |
+
|
| 96 |
+
| 配置文件 | 架构 | 层数 | 参数量 | 说明 |
|
| 97 |
+
|---------|------|------|--------|------|
|
| 98 |
+
| `train_gpt2.py` | GPT-2 | 36 | 505.62M | 标准基线 |
|
| 99 |
+
| `train_gpt2_new.py` | GPT-2-New | 36 | 514.01M | + RoPE、SwiGLU、RMSNorm |
|
| 100 |
+
| `train_reflow_1.py` | reFlow | 32 | 463.67M | 基础 reFlow,恒定学习率 |
|
| 101 |
+
| `train_reflow_1_big.py` | reFlow | 36 | 515.06M | 学习率衰减,用于可解释性实验 |
|
| 102 |
+
| `train_reflow_1_topk_big.py` | reFlow-TopK | 36 | 515.06M | + ReLU + Top-64 稀疏化 |
|
| 103 |
+
| `train_reflow_1_lite.py` | reFlow-Lite | 32 | 413.34M | + GQA,缩减 MLP |
|
| 104 |
+
| `train_reflow_1_small.py` | reFlow | 6 | 46.47M | 小规模验证 |
|
| 105 |
+
|
| 106 |
+
### 断点续训
|
| 107 |
+
|
| 108 |
+
使用对应的 `_resume` 配置文件(如 `train_reflow_1_big_resume.py`)。
|
| 109 |
+
|
| 110 |
+
## 文本生成
|
| 111 |
+
|
| 112 |
+
```bash
|
| 113 |
+
python sample.py config/sample_reflow_1.py
|
| 114 |
+
```
|
| 115 |
+
|
| 116 |
+
在配置文件中修改 prompt、temperature、top_k 等参数。
|
| 117 |
+
|
| 118 |
+
## 可解释性实验
|
| 119 |
+
|
| 120 |
+
实验套件对训练好的 reFlow 模型执行 12 项分析,提供中英文两个版本:
|
| 121 |
+
|
| 122 |
+
```bash
|
| 123 |
+
python experiment.py config/train_reflow_1_big.py # 中文版
|
| 124 |
+
python experiment_en.py config/train_reflow_1_big.py # English
|
| 125 |
+
```
|
| 126 |
+
|
| 127 |
+
运行后将出现交互式菜单:
|
| 128 |
+
|
| 129 |
+
| 编号 | 实验名称 | 分组 |
|
| 130 |
+
|------|---------|------|
|
| 131 |
+
| 1 | 配方空间图谱 — 配方近邻与聚类热力图 | A. 信号本体 |
|
| 132 |
+
| 2 | 信号稀疏性分析 — 激活稀疏率统计 | A. 信号本体 |
|
| 133 |
+
| 3 | 信号基底几何 — 奇异值分解与有效秩 | A. 信号本体 |
|
| 134 |
+
| 4 | 语义星空图 — PCA 聚类可视化 | B. 语义性质 |
|
| 135 |
+
| 5 | 语义代数运算 — 向量算术(king − man + woman = queen) | B. 语义性质 |
|
| 136 |
+
| 6 | 拼写鲁棒性 — 对拼写错误的容忍度 | B. 语义性质 |
|
| 137 |
+
| 7 | 层级概率演化 — 逐层预测概率结晶过程 | C. 机械分析 |
|
| 138 |
+
| 8 | 信号流追踪 — 信号激活热力图 | C. 机械分析 |
|
| 139 |
+
| 9 | 因果消融曲线 — 逐信号消融概率变化 | C. 机械分析 |
|
| 140 |
+
| 10 | 情绪手术 — 信号注入实现情感翻转 | D. 操控验证 |
|
| 141 |
+
| 11 | 概念注入 — 二分搜索概念植入 | D. 操控验证 |
|
| 142 |
+
| 12 | 基因库篡改 — 全局配方矩阵操控 | D. 操控验证 |
|
| 143 |
+
|
| 144 |
+
输入 `all` 运行全部实验,或输入编号(如 `1 3 5`)选择性运行。实验报告保存至 `out/<模型名>/audit_reports/`。
|
| 145 |
+
|
| 146 |
+
## Checkpoint 检查
|
| 147 |
+
|
| 148 |
+
```bash
|
| 149 |
+
python check.py config/train_reflow_1.py out/reflow-1/ckpt.pt
|
| 150 |
+
```
|
| 151 |
+
|
| 152 |
+
## 复现论文实验
|
| 153 |
+
|
| 154 |
+
### 第 4 章:收敛实验
|
| 155 |
+
|
| 156 |
+
依次训练各模型变体并对比训练曲线:
|
| 157 |
+
|
| 158 |
+
```bash
|
| 159 |
+
# GPT-2 基线
|
| 160 |
+
torchrun --standalone --nproc_per_node=4 train.py config/train_gpt2.py
|
| 161 |
+
|
| 162 |
+
# GPT-2-New(现代化组件)
|
| 163 |
+
torchrun --standalone --nproc_per_node=4 train.py config/train_gpt2_new.py
|
| 164 |
+
|
| 165 |
+
# reFlow 基础版
|
| 166 |
+
torchrun --standalone --nproc_per_node=4 train.py config/train_reflow_1.py
|
| 167 |
+
|
| 168 |
+
# reFlow-Big(用于第 5-6 章实验)
|
| 169 |
+
torchrun --standalone --nproc_per_node=4 train.py config/train_reflow_1_big.py
|
| 170 |
+
|
| 171 |
+
# reFlow-TopK-Big(硬稀疏变体)
|
| 172 |
+
torchrun --standalone --nproc_per_node=4 train.py config/train_reflow_1_topk_big.py
|
| 173 |
+
|
| 174 |
+
# reFlow-Lite(轻量版)
|
| 175 |
+
torchrun --standalone --nproc_per_node=4 train.py config/train_reflow_1_lite.py
|
| 176 |
+
|
| 177 |
+
# reFlow-Small(小规模验证)
|
| 178 |
+
torchrun --standalone --nproc_per_node=4 train.py config/train_reflow_1_small.py
|
| 179 |
+
```
|
| 180 |
+
|
| 181 |
+
### 第 5 章:可解释性实验(reFlow-1-Big)
|
| 182 |
+
|
| 183 |
+
```bash
|
| 184 |
+
python experiment.py config/train_reflow_1_big.py
|
| 185 |
+
# 输入 all 运行全部 12 项实验
|
| 186 |
+
```
|
| 187 |
+
|
| 188 |
+
### 第 6 章:硬稀疏对比实验(reFlow-1-TopK-Big)
|
| 189 |
+
|
| 190 |
+
```bash
|
| 191 |
+
python experiment.py config/train_reflow_1_topk_big.py
|
| 192 |
+
# 输入 all 运行全部 12 项实验
|
| 193 |
+
```
|
| 194 |
+
|
| 195 |
+
对比两组实验报告即可复现论文第 6 章的对比表格。
|
| 196 |
+
|
| 197 |
+
## 许可证
|
| 198 |
+
|
| 199 |
+
MIT License。基于 [nanoGPT](https://github.com/karpathy/nanoGPT)(Andrej Karpathy)二次开发。
|
bench.py
ADDED
|
@@ -0,0 +1,182 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Benchmark script for model performance testing
|
| 3 |
+
|
| 4 |
+
REQUIRED:
|
| 5 |
+
1. You must specify a config file from the config/ directory
|
| 6 |
+
2. All configuration must be in the config file. No CLI overrides allowed
|
| 7 |
+
|
| 8 |
+
Usage:
|
| 9 |
+
python bench.py <config_file>
|
| 10 |
+
|
| 11 |
+
Example:
|
| 12 |
+
python bench.py config/bench_gpt2.py
|
| 13 |
+
"""
|
| 14 |
+
import sys
|
| 15 |
+
import os
|
| 16 |
+
|
| 17 |
+
# -----------------------------------------------------------------------------
|
| 18 |
+
# Configuration loading (BEFORE imports to validate config first)
|
| 19 |
+
# -----------------------------------------------------------------------------
|
| 20 |
+
if len(sys.argv) != 2:
|
| 21 |
+
print("ERROR: Invalid arguments!")
|
| 22 |
+
print("Usage: python bench.py <config_file>")
|
| 23 |
+
print("Available configs in config/:")
|
| 24 |
+
print(" - bench_gpt2.py")
|
| 25 |
+
sys.exit(1)
|
| 26 |
+
|
| 27 |
+
config_file = sys.argv[1]
|
| 28 |
+
|
| 29 |
+
# Disallow --key=value arguments
|
| 30 |
+
for arg in sys.argv[1:]:
|
| 31 |
+
if arg.startswith('--'):
|
| 32 |
+
print(f"ERROR: CLI overrides are not supported. All config must be in file: {config_file}")
|
| 33 |
+
sys.exit(1)
|
| 34 |
+
|
| 35 |
+
# Load config
|
| 36 |
+
print(f"Loading config from: {config_file}")
|
| 37 |
+
exec(open(config_file).read())
|
| 38 |
+
|
| 39 |
+
# Validate required config keys
|
| 40 |
+
required_keys = ['model_config']
|
| 41 |
+
missing_keys = [k for k in required_keys if k not in globals()]
|
| 42 |
+
if missing_keys:
|
| 43 |
+
print(f"ERROR: Missing required config keys: {missing_keys}")
|
| 44 |
+
sys.exit(1)
|
| 45 |
+
|
| 46 |
+
# Load model configuration
|
| 47 |
+
model_config = globals()['model_config']
|
| 48 |
+
model_file = f"models/{model_config}.py"
|
| 49 |
+
try:
|
| 50 |
+
exec(open(model_file).read())
|
| 51 |
+
except FileNotFoundError:
|
| 52 |
+
print(f"ERROR: Model file not found: {model_file}")
|
| 53 |
+
sys.exit(1)
|
| 54 |
+
|
| 55 |
+
# Get model-specific required config keys from GPTConfig
|
| 56 |
+
model_required_keys = []
|
| 57 |
+
if 'GPTConfig' in globals():
|
| 58 |
+
config_class = globals()['GPTConfig']
|
| 59 |
+
import dataclasses
|
| 60 |
+
for field in dataclasses.fields(config_class):
|
| 61 |
+
model_required_keys.append(field.name)
|
| 62 |
+
|
| 63 |
+
# Validate model-specific config keys
|
| 64 |
+
if init_from == 'scratch':
|
| 65 |
+
missing_model_keys = [k for k in model_required_keys if k not in globals()]
|
| 66 |
+
if missing_model_keys:
|
| 67 |
+
print(f"ERROR: Missing required model config keys for {model_config}: {missing_model_keys}")
|
| 68 |
+
sys.exit(1)
|
| 69 |
+
|
| 70 |
+
# Print configuration
|
| 71 |
+
print("\n" + "=" * 60)
|
| 72 |
+
print("BENCH CONFIGURATION")
|
| 73 |
+
print("=" * 60)
|
| 74 |
+
for key in sorted(globals().keys()):
|
| 75 |
+
val = globals().get(key)
|
| 76 |
+
if isinstance(val, (int, float, bool, str)) and not key.startswith('_'):
|
| 77 |
+
print(f" {key:30s} = {val}")
|
| 78 |
+
print("=" * 60 + "\n")
|
| 79 |
+
|
| 80 |
+
# Now import dependencies
|
| 81 |
+
import os
|
| 82 |
+
from contextlib import nullcontext
|
| 83 |
+
import numpy as np
|
| 84 |
+
import time
|
| 85 |
+
import torch
|
| 86 |
+
|
| 87 |
+
# Import GPTConfig and GPT
|
| 88 |
+
GPTConfig = globals()['GPTConfig']
|
| 89 |
+
GPT = globals()['GPT']
|
| 90 |
+
|
| 91 |
+
# Auto-detect dtype
|
| 92 |
+
if dtype == 'bfloat16' and not (torch.cuda.is_available() and torch.cuda.is_bf16_supported()):
|
| 93 |
+
dtype = 'float16'
|
| 94 |
+
|
| 95 |
+
torch.manual_seed(seed)
|
| 96 |
+
torch.cuda.manual_seed(seed)
|
| 97 |
+
torch.backends.cuda.matmul.allow_tf32 = True
|
| 98 |
+
torch.backends.cudnn.allow_tf32 = True
|
| 99 |
+
device_type = 'cuda' if 'cuda' in device else 'cpu'
|
| 100 |
+
ptdtype = {'float32': torch.float32, 'bfloat16': torch.bfloat16, 'float16': torch.float16}[dtype]
|
| 101 |
+
ctx = nullcontext() if device_type == 'cpu' else torch.amp.autocast(device_type=device_type, dtype=ptdtype)
|
| 102 |
+
|
| 103 |
+
# data loading
|
| 104 |
+
if real_data:
|
| 105 |
+
dataset = globals().get('dataset', 'openwebtext')
|
| 106 |
+
data_dir = os.path.join('data', dataset)
|
| 107 |
+
train_data = np.memmap(os.path.join(data_dir, 'train.bin'), dtype=np.uint16, mode='r')
|
| 108 |
+
def get_batch(split):
|
| 109 |
+
data = train_data
|
| 110 |
+
ix = torch.randint(len(data) - block_size, (batch_size,))
|
| 111 |
+
x = torch.stack([torch.from_numpy((data[i:i+block_size]).astype(np.int64)) for i in ix])
|
| 112 |
+
y = torch.stack([torch.from_numpy((data[i+1:i+1+block_size]).astype(np.int64)) for i in ix])
|
| 113 |
+
x, y = x.pin_memory().to(device, non_blocking=True), y.pin_memory().to(device, non_blocking=True)
|
| 114 |
+
return x, y
|
| 115 |
+
else:
|
| 116 |
+
x = torch.randint(50304, (batch_size, block_size), device=device)
|
| 117 |
+
y = torch.randint(50304, (batch_size, block_size), device=device)
|
| 118 |
+
get_batch = lambda split: (x, y)
|
| 119 |
+
|
| 120 |
+
# model init
|
| 121 |
+
gptconf = GPTConfig(
|
| 122 |
+
block_size=block_size,
|
| 123 |
+
n_layer=n_layer,
|
| 124 |
+
n_head=n_head,
|
| 125 |
+
n_embd=n_embd,
|
| 126 |
+
dropout=0,
|
| 127 |
+
bias=bias,
|
| 128 |
+
)
|
| 129 |
+
model = GPT(gptconf)
|
| 130 |
+
model.to(device)
|
| 131 |
+
|
| 132 |
+
optimizer = model.configure_optimizers(weight_decay=1e-2, learning_rate=1e-4, betas=(0.9, 0.95), device_type=device_type)
|
| 133 |
+
|
| 134 |
+
if compile:
|
| 135 |
+
print("Compiling model...")
|
| 136 |
+
model = torch.compile(model)
|
| 137 |
+
|
| 138 |
+
if profile:
|
| 139 |
+
wait, warmup, active = 5, 5, 5
|
| 140 |
+
num_steps = wait + warmup + active
|
| 141 |
+
with torch.profiler.profile(
|
| 142 |
+
activities=[torch.profiler.ProfilerActivity.CPU, torch.profiler.ProfilerActivity.CUDA],
|
| 143 |
+
schedule=torch.profiler.schedule(wait=wait, warmup=warmup, active=active, repeat=1),
|
| 144 |
+
on_trace_ready=torch.profiler.tensorboard_trace_handler('./bench_log'),
|
| 145 |
+
record_shapes=False,
|
| 146 |
+
profile_memory=False,
|
| 147 |
+
with_stack=False,
|
| 148 |
+
with_flops=True,
|
| 149 |
+
with_modules=False,
|
| 150 |
+
) as prof:
|
| 151 |
+
X, Y = get_batch('train')
|
| 152 |
+
for k in range(num_steps):
|
| 153 |
+
with ctx:
|
| 154 |
+
logits, loss = model(X, Y)
|
| 155 |
+
X, Y = get_batch('train')
|
| 156 |
+
optimizer.zero_grad(set_to_none=True)
|
| 157 |
+
loss.backward()
|
| 158 |
+
optimizer.step()
|
| 159 |
+
lossf = loss.item()
|
| 160 |
+
print(f"{k}/{num_steps} loss: {lossf:.4f}")
|
| 161 |
+
prof.step()
|
| 162 |
+
else:
|
| 163 |
+
# simple benchmarking
|
| 164 |
+
torch.cuda.synchronize()
|
| 165 |
+
for stage, num_steps in enumerate([10, 20]):
|
| 166 |
+
t0 = time.time()
|
| 167 |
+
X, Y = get_batch('train')
|
| 168 |
+
for k in range(num_steps):
|
| 169 |
+
with ctx:
|
| 170 |
+
logits, loss = model(X, Y)
|
| 171 |
+
X, Y = get_batch('train')
|
| 172 |
+
optimizer.zero_grad(set_to_none=True)
|
| 173 |
+
loss.backward()
|
| 174 |
+
optimizer.step()
|
| 175 |
+
lossf = loss.item()
|
| 176 |
+
print(f"{k}/{num_steps} loss: {lossf:.4f}")
|
| 177 |
+
torch.cuda.synchronize()
|
| 178 |
+
t1 = time.time()
|
| 179 |
+
dt = t1 - t0
|
| 180 |
+
mfu = model.estimate_mfu(batch_size * 1 * num_steps, dt)
|
| 181 |
+
if stage == 1:
|
| 182 |
+
print(f"time per iteration: {dt/num_steps*1000:.4f}ms, MFU: {mfu*100:.2f}%")
|
check.py
ADDED
|
@@ -0,0 +1,122 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Check model parameters from a checkpoint
|
| 3 |
+
|
| 4 |
+
Usage:
|
| 5 |
+
python check.py <config_file> <checkpoint_file>
|
| 6 |
+
|
| 7 |
+
Example:
|
| 8 |
+
python check.py config/train_reflow_web.py out-web/ckpt.pt
|
| 9 |
+
"""
|
| 10 |
+
import sys
|
| 11 |
+
import os
|
| 12 |
+
import torch
|
| 13 |
+
|
| 14 |
+
# -----------------------------------------------------------------------------
|
| 15 |
+
# Configuration loading
|
| 16 |
+
# -----------------------------------------------------------------------------
|
| 17 |
+
if len(sys.argv) != 3:
|
| 18 |
+
print("ERROR: Invalid arguments!")
|
| 19 |
+
print("Usage: python check.py <config_file> <checkpoint_file>")
|
| 20 |
+
print("Example: python check.py config/train_reflow_web.py out-web/ckpt.pt")
|
| 21 |
+
sys.exit(1)
|
| 22 |
+
|
| 23 |
+
config_file = sys.argv[1]
|
| 24 |
+
checkpoint_file = sys.argv[2]
|
| 25 |
+
|
| 26 |
+
if not os.path.exists(config_file):
|
| 27 |
+
print(f"ERROR: Config file not found: {config_file}")
|
| 28 |
+
sys.exit(1)
|
| 29 |
+
|
| 30 |
+
if not os.path.exists(checkpoint_file):
|
| 31 |
+
print(f"ERROR: Checkpoint file not found: {checkpoint_file}")
|
| 32 |
+
sys.exit(1)
|
| 33 |
+
|
| 34 |
+
# Load config
|
| 35 |
+
print(f"Loading config from: {config_file}")
|
| 36 |
+
exec(open(config_file).read())
|
| 37 |
+
|
| 38 |
+
# Load model configuration
|
| 39 |
+
model_config = globals().get('model_config')
|
| 40 |
+
if not model_config:
|
| 41 |
+
print("ERROR: 'model_config' is required in config file")
|
| 42 |
+
sys.exit(1)
|
| 43 |
+
|
| 44 |
+
model_file = f"models/{model_config}.py"
|
| 45 |
+
try:
|
| 46 |
+
exec(open(model_file).read())
|
| 47 |
+
except FileNotFoundError:
|
| 48 |
+
print(f"ERROR: Model file not found: {model_file}")
|
| 49 |
+
sys.exit(1)
|
| 50 |
+
|
| 51 |
+
# Import GPTConfig and GPT
|
| 52 |
+
GPTConfig = globals()['GPTConfig']
|
| 53 |
+
GPT = globals()['GPT']
|
| 54 |
+
|
| 55 |
+
# Load checkpoint
|
| 56 |
+
print(f"Loading checkpoint from: {checkpoint_file}")
|
| 57 |
+
checkpoint = torch.load(checkpoint_file, map_location='cpu')
|
| 58 |
+
model_args = checkpoint['model_args']
|
| 59 |
+
|
| 60 |
+
# Create model and load weights
|
| 61 |
+
gptconf = GPTConfig(**model_args)
|
| 62 |
+
model = GPT(gptconf)
|
| 63 |
+
state_dict = checkpoint['model']
|
| 64 |
+
|
| 65 |
+
# Handle PyTorch 2.0+ compiled model keys
|
| 66 |
+
unwanted_prefix = '_orig_mod.'
|
| 67 |
+
for k in list(state_dict.keys()):
|
| 68 |
+
if k.startswith(unwanted_prefix):
|
| 69 |
+
state_dict[k[len(unwanted_prefix):]] = state_dict.pop(k)
|
| 70 |
+
|
| 71 |
+
model.load_state_dict(state_dict)
|
| 72 |
+
|
| 73 |
+
# Print model information
|
| 74 |
+
print("\n" + "=" * 60)
|
| 75 |
+
print("MODEL INFORMATION")
|
| 76 |
+
print("=" * 60)
|
| 77 |
+
|
| 78 |
+
print(f"\nModel Architecture: {model_config}")
|
| 79 |
+
print(f"Checkpoint: {checkpoint_file}")
|
| 80 |
+
|
| 81 |
+
print(f"\nModel Arguments:")
|
| 82 |
+
for k, v in model_args.items():
|
| 83 |
+
print(f" {k:20s} = {v}")
|
| 84 |
+
|
| 85 |
+
print(f"\nTotal Parameters: {model.get_num_params()/1e6:.2f}M")
|
| 86 |
+
|
| 87 |
+
# Count parameters by component
|
| 88 |
+
if hasattr(model, 'transformer'):
|
| 89 |
+
print("\nParameters by component:")
|
| 90 |
+
|
| 91 |
+
# Show wte (embedding) - for Reflow includes vocab_to_signals + signal_basis
|
| 92 |
+
if hasattr(model.transformer, 'wte'):
|
| 93 |
+
wte = model.transformer.wte
|
| 94 |
+
if hasattr(wte, 'vocab_to_signals'):
|
| 95 |
+
vocab_to_signals_params = wte.vocab_to_signals.weight.numel()
|
| 96 |
+
print(f" transformer.wte.vocab_to_signals: {vocab_to_signals_params/1e6:>10.2f}M")
|
| 97 |
+
if hasattr(wte, 'signal_basis'):
|
| 98 |
+
signal_basis_params = wte.signal_basis.numel()
|
| 99 |
+
print(f" transformer.wte.signal_basis: {signal_basis_params/1e6:>10.2f}M")
|
| 100 |
+
wte_params = sum(p.numel() for p in wte.parameters())
|
| 101 |
+
print(f" transformer.wte (total): {wte_params/1e6:>10.2f}M")
|
| 102 |
+
|
| 103 |
+
# Count transformer.h (layers)
|
| 104 |
+
if hasattr(model.transformer, 'h'):
|
| 105 |
+
h_params = sum(p.numel() for p in model.transformer.h.parameters())
|
| 106 |
+
print(f" transformer.h (all layers): {h_params/1e6:>10.2f}M")
|
| 107 |
+
|
| 108 |
+
# Show ln_f
|
| 109 |
+
if hasattr(model.transformer, 'ln_f'):
|
| 110 |
+
ln_f_params = sum(p.numel() for p in model.transformer.ln_f.parameters())
|
| 111 |
+
print(f" transformer.ln_f: {ln_f_params/1e6:>10.2f}M")
|
| 112 |
+
|
| 113 |
+
print(f"\nTotal trainable parameters: {sum(p.numel() for p in model.parameters() if p.requires_grad)/1e6:.2f}M")
|
| 114 |
+
print(f"Total non-trainable parameters: {sum(p.numel() for p in model.parameters() if not p.requires_grad)/1e6:.2f}M")
|
| 115 |
+
|
| 116 |
+
# Training info if available
|
| 117 |
+
if 'iter_num' in checkpoint:
|
| 118 |
+
print(f"\nTraining Info:")
|
| 119 |
+
print(f" iter_num: {checkpoint['iter_num']}")
|
| 120 |
+
print(f" best_val_loss: {checkpoint.get('best_val_loss', 'N/A')}")
|
| 121 |
+
|
| 122 |
+
print("=" * 60)
|
config/base.py
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
# Baseline training configuration (GPT-2 124M-scale defaults on OpenWebText).

# -- run / evaluation bookkeeping --
out_dir = 'out'
eval_interval = 2000
log_interval = 1
eval_iters = 200
eval_only = False
always_save_checkpoint = True
init_from = 'scratch'

# -- weights & biases logging --
wandb_log = False
wandb_project = 'owt'
wandb_run_name = 'gpt2'

# -- data / batching --
dataset = 'openwebtext'
gradient_accumulation_steps = 5 * 8  # effective accumulation of 40 micro-steps
batch_size = 12
block_size = 1024

# -- model shape --
n_layer = 12
n_head = 12
n_embd = 768
dropout = 0.0
bias = False

# -- optimizer (AdamW-style hyperparameters) --
learning_rate = 6e-4
max_iters = 600000
weight_decay = 1e-1
beta1 = 0.9
beta2 = 0.95
grad_clip = 1.0

# -- learning-rate schedule --
decay_lr = True
warmup_iters = 2000
lr_decay_iters = 600000
min_lr = 6e-5

# -- distributed backend --
backend = 'nccl'

# -- runtime --
device = 'cuda'
dtype = 'bfloat16'
compile = True  # intentionally shadows the builtin; consumed by the trainer
|
config/bench_base.py
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
# Baseline benchmarking configuration (used by bench.py).

# -- batching / data --
batch_size = 12
block_size = 1024
bias = False
real_data = True  # benchmark on real dataset batches rather than random data
seed = 1337

# -- runtime --
device = 'cuda'
dtype = 'bfloat16'
compile = True
profile = False  # when True, bench.py runs the torch profiler instead

# -- model shape --
n_layer = 12
n_head = 12
n_embd = 768
|
config/bench_gpt2.py
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
# Benchmarking configuration for the stock GPT-2 model definition.

model_config = 'gpt2'

# -- batching / data --
batch_size = 12
block_size = 1024
bias = False
real_data = True
seed = 1337

# -- runtime --
device = 'cuda'
dtype = 'bfloat16'
compile = True
profile = False

# -- model shape --
n_layer = 12
n_head = 12
n_embd = 768

init_from = 'scratch'
|
config/eval_gpt2.py
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Evaluation-only configuration for pretrained GPT-2 (base size).
batch_size = 8
eval_iters = 500
eval_only = True  # run a single eval pass, no training
wandb_log = False
init_from = 'gpt2'  # pull pretrained weights by name
|
config/eval_gpt2_large.py
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Evaluation-only configuration for pretrained GPT-2 Large.
batch_size = 8
eval_iters = 500
eval_only = True
wandb_log = False
init_from = 'gpt2-large'
|
config/eval_gpt2_medium.py
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Evaluation-only configuration for pretrained GPT-2 Medium.
batch_size = 8
eval_iters = 500
eval_only = True
wandb_log = False
init_from = 'gpt2-medium'
|
config/eval_gpt2_xl.py
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Evaluation-only configuration for pretrained GPT-2 XL.
batch_size = 8
eval_iters = 500
eval_only = True
wandb_log = False
init_from = 'gpt2-xl'
|
config/finetune_shakespeare.py
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
# Fine-tuning configuration: GPT-2 XL on the Shakespeare dataset.

import time

model_config = 'gpt2'

# -- run / evaluation bookkeeping --
out_dir = 'out-shakespeare'
eval_interval = 5
log_interval = 1
eval_iters = 40
eval_only = False
always_save_checkpoint = False
init_from = 'gpt2-xl'  # start from pretrained GPT-2 XL weights

# -- weights & biases logging --
wandb_log = False
wandb_project = 'shakespeare'
wandb_run_name = 'ft-' + str(time.time())  # unique name per launch

# -- data / batching --
dataset = 'shakespeare'
gradient_accumulation_steps = 32
batch_size = 1
block_size = 1024

# -- model shape (None => inherit from the pretrained checkpoint) --
n_layer = None
n_head = None
n_embd = None
dropout = 0.1
bias = False

# -- optimizer --
learning_rate = 3e-5
max_iters = 20
weight_decay = 1e-1
beta1 = 0.9
beta2 = 0.95
grad_clip = 1.0

# -- learning-rate schedule (constant LR fine-tune) --
decay_lr = False
warmup_iters = 0
lr_decay_iters = 20
min_lr = 3e-5

# -- distributed backend --
backend = 'nccl'

# -- runtime --
device = 'cuda'
dtype = 'bfloat16'
compile = True
|
config/sample_base.py
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
# Baseline sampling configuration (used by sample.py).

init_from = 'resume'  # load the latest checkpoint from out_dir
out_dir = 'out'

# -- generation settings --
start = "\n"  # prompt; a bare newline starts from "nothing"
num_samples = 10
max_new_tokens = 500
temperature = 0.8
top_k = 200

# -- runtime --
seed = 1337
device = 'cuda'
dtype = 'bfloat16'
compile = False
|
config/sample_gpt2.py
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
# Sampling configuration for the trained GPT-2 run.

model_config = 'gpt2'

out_dir = 'out/gpt2'
init_from = 'resume'

# -- generation settings --
start = "The sun is bright, the night is dark, fire is hot, and ice is"
num_samples = 10
max_new_tokens = 500
temperature = 0.8
top_k = 200

seed = 1337

# -- runtime --
device = 'cuda'
dtype = 'bfloat16'
compile = False
|
config/sample_gpt2_new.py
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
# Sampling configuration for the 'gpt2-new' run (near-greedy Q&A probe).

model_config = 'gpt2-new'

out_dir = 'out/gpt2-new'
init_from = 'resume'

# -- generation settings (very low temperature => near-deterministic) --
start = "Q: What is the capital of China?\nA: Beijing\nQ: What is the capital of Japan?\nA: Tokyo\nQ: What is the capital of France?\nA:"
num_samples = 10
max_new_tokens = 50
temperature = 0.01
top_k = 20

seed = 1337

# -- runtime --
device = 'cuda'
dtype = 'float16'
compile = False
|
config/sample_gpt2_new_nolr.py
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
# Sampling configuration for the 'gpt2-new' run trained without LR decay.

model_config = 'gpt2-new'

out_dir = 'out/gpt2-new-nolr'
init_from = 'resume'

# -- generation settings --
start = "The sun is bright, the night is dark, fire is hot, and ice is"
num_samples = 10
max_new_tokens = 500
temperature = 0.8
top_k = 200

seed = 1337

# -- runtime --
device = 'cuda'
dtype = 'bfloat16'
compile = False
|
config/sample_reflow_1.py
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
# Sampling configuration for the reflow-1 run (near-greedy Q&A probe).

model_config = 'reflow'

out_dir = 'out/reflow-1'
init_from = 'resume'

# -- generation settings --
start = "Q: What is the capital of China?\nA: Beijing\nQ: What is the capital of Japan?\nA: Tokyo\nQ: What is the capital of France?\nA:"
num_samples = 10
max_new_tokens = 50
temperature = 0.01
top_k = 20

seed = 1337

# -- runtime --
device = 'cuda'
dtype = 'float16'
compile = False
|
config/sample_reflow_1_big.py
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
# Sampling configuration for the reflow-1-big run.

model_config = 'reflow'

out_dir = 'out/reflow-1-big'
init_from = 'resume'

# -- generation settings --
start = "The sun is bright, the night is dark, fire is hot, and ice is"
num_samples = 10
max_new_tokens = 500
temperature = 0.8
top_k = 200

seed = 1337

# -- runtime --
device = 'cuda'
dtype = 'float16'
compile = False
|
config/sample_reflow_1_lite.py
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
# Sampling configuration for the reflow-1-lite run.

model_config = 'reflow-lite'

out_dir = 'out/reflow-1-lite'
init_from = 'resume'

# -- generation settings --
start = "The sun is bright, the night is dark, fire is hot, and ice is"
num_samples = 10
max_new_tokens = 500
temperature = 0.8
top_k = 200

seed = 1337

# -- runtime --
device = 'cuda'
dtype = 'float16'
compile = False
|
config/sample_reflow_1_small.py
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
# Sampling configuration for the reflow-1-small run.

model_config = 'reflow'

out_dir = 'out/reflow-1-small'
init_from = 'resume'

# -- generation settings --
start = "The sun is bright, the night is dark, fire is hot, and ice is"
num_samples = 10
max_new_tokens = 500
temperature = 0.8
top_k = 200

seed = 1337

# -- runtime --
device = 'cuda'
dtype = 'float16'
compile = False
|
config/sample_reflow_1_small_sp.py
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
# Sampling configuration for the reflow-1-small-sp run.

model_config = 'reflow'

out_dir = 'out/reflow-1-small-sp'
init_from = 'resume'

# -- generation settings --
start = "The sun is bright, the night is dark, fire is hot, and ice is"
num_samples = 10
max_new_tokens = 500
temperature = 0.8
top_k = 200

seed = 1337

# -- runtime --
device = 'cuda'
dtype = 'float16'
compile = False
|
config/sample_reflow_1_topk.py
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
# Sampling configuration for the reflow-1-topk run (near-greedy Q&A probe).

model_config = 'reflow-topk'

out_dir = 'out/reflow-1-topk'
init_from = 'resume'

# -- generation settings --
start = "Q: What is the capital of China?\nA: Beijing\nQ: What is the capital of Japan?\nA: Tokyo\nQ: What is the capital of France?\nA:"
num_samples = 10
max_new_tokens = 50
temperature = 0.01
top_k = 20

seed = 1337

# -- runtime --
device = 'cuda'
dtype = 'float16'
compile = False
|
config/sample_reflow_1_topk_big.py
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
# Sampling configuration for the reflow-1-topk-big run (near-greedy Q&A probe).

model_config = 'reflow-topk'

out_dir = 'out/reflow-1-topk-big'
init_from = 'resume'

# -- generation settings --
start = "Q: What is the capital of China?\nA: Beijing\nQ: What is the capital of Japan?\nA: Tokyo\nQ: What is the capital of France?\nA:"
num_samples = 10
max_new_tokens = 50
temperature = 0.01
top_k = 20

seed = 1337

# -- runtime --
device = 'cuda'
dtype = 'float16'
compile = False
|
config/sample_sft_reflow_1.py
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
# Sampling configuration for the SFT'd (LIMA) reflow-1 run.

model_config = 'reflow'

out_dir = 'out/sft-lima-reflow-1'
init_from = 'resume'

# -- generation settings --
start = "Question: Which city is the capital of France?\nAnswer: "
num_samples = 10
max_new_tokens = 500
temperature = 0.1
top_k = 20

seed = 1337

# -- runtime --
device = 'cuda'
dtype = 'float16'
compile = False
|
config/train_gpt2.py
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Training configuration: GPT-2 (36-layer) from scratch on OpenWebText.
model_config = 'gpt2'

log_file = 'logs/gpt2.log'

# -- run / evaluation bookkeeping --
out_dir = 'out/gpt2'
eval_interval = 500
log_interval = 1
eval_iters = 500
eval_only = False
always_save_checkpoint = False
init_from = 'scratch'

# -- weights & biases logging --
wandb_log = False
wandb_project = 'owt'
wandb_run_name = 'gpt2'

# -- data / batching --
dataset = 'openwebtext'
gradient_accumulation_steps = 64
batch_size = 1
block_size = 1024

# -- model shape --
n_layer = 36
n_head = 16
n_embd = 1024
vocab_size = 50304  # padded vocab (multiple of 64) for efficiency
dropout = 0.0
bias = False

# -- optimizer --
learning_rate = 3e-4
max_iters = 50000
weight_decay = 1e-1
beta1 = 0.9
beta2 = 0.95
grad_clip = 1.0

# -- learning-rate schedule --
decay_lr = True
warmup_iters = 1000
lr_decay_iters = 50000
min_lr = 3e-5

backend = 'nccl'

# -- runtime --
device = 'cuda'
dtype = 'float16'
compile = True
|
config/train_gpt2_new.py
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Training configuration: 'gpt2-new' variant from scratch on OpenWebText.
model_config = 'gpt2-new'

log_file = 'logs/gpt2-new.log'

# -- run / evaluation bookkeeping --
out_dir = 'out/gpt2-new'
eval_interval = 500
log_interval = 1
eval_iters = 500
eval_only = False
always_save_checkpoint = False
init_from = 'scratch'

# -- weights & biases logging --
wandb_log = False
wandb_project = 'owt'
wandb_run_name = 'gpt2'

# -- data / batching --
dataset = 'openwebtext'
gradient_accumulation_steps = 64
batch_size = 1
block_size = 1024

# -- model shape --
n_layer = 36
n_head = 16
n_embd = 1024
vocab_size = 50304
dropout = 0.0
bias = False

# -- optimizer --
learning_rate = 3e-4
max_iters = 50000
weight_decay = 1e-1
beta1 = 0.9
beta2 = 0.95
grad_clip = 1.0

# -- learning-rate schedule --
decay_lr = True
warmup_iters = 1000
lr_decay_iters = 50000
min_lr = 3e-5

backend = 'nccl'

# -- runtime --
device = 'cuda'
dtype = 'float16'
compile = True
|
config/train_gpt2_new_nolr.py
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Training configuration: 'gpt2-new' variant with a constant learning rate.
model_config = 'gpt2-new'

log_file = 'logs/gpt2-new-nolr.log'

# -- run / evaluation bookkeeping --
out_dir = 'out/gpt2-new-nolr'
eval_interval = 500
log_interval = 1
eval_iters = 500
eval_only = False
always_save_checkpoint = False
init_from = 'scratch'

# -- weights & biases logging --
wandb_log = False
wandb_project = 'owt'
wandb_run_name = 'gpt2'

# -- data / batching --
dataset = 'openwebtext'
gradient_accumulation_steps = 64
batch_size = 1
block_size = 1024

# -- model shape --
n_layer = 36
n_head = 16
n_embd = 1024
vocab_size = 50304
dropout = 0.0
bias = False

# -- optimizer --
learning_rate = 1.5e-4
max_iters = 50000
weight_decay = 0.1
beta1 = 0.9
beta2 = 0.95
grad_clip = 1.0

# -- learning-rate schedule (disabled: constant LR) --
decay_lr = False
warmup_iters = 2000
lr_decay_iters = 600000
min_lr = 6e-5

backend = 'nccl'

# -- runtime --
device = 'cuda'
dtype = 'float16'
compile = True
|
config/train_gpt2_new_nolr_resume.py
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Resume configuration for the 'gpt2-new' constant-LR run
# (identical to train_gpt2_new_nolr.py except init_from = 'resume').
model_config = 'gpt2-new'

log_file = 'logs/gpt2-new-nolr.log'

# -- run / evaluation bookkeeping --
out_dir = 'out/gpt2-new-nolr'
eval_interval = 500
log_interval = 1
eval_iters = 500
eval_only = False
always_save_checkpoint = False
init_from = 'resume'

# -- weights & biases logging --
wandb_log = False
wandb_project = 'owt'
wandb_run_name = 'gpt2'

# -- data / batching --
dataset = 'openwebtext'
gradient_accumulation_steps = 64
batch_size = 1
block_size = 1024

# -- model shape --
n_layer = 36
n_head = 16
n_embd = 1024
vocab_size = 50304
dropout = 0.0
bias = False

# -- optimizer --
learning_rate = 1.5e-4
max_iters = 50000
weight_decay = 0.1
beta1 = 0.9
beta2 = 0.95
grad_clip = 1.0

# -- learning-rate schedule (disabled: constant LR) --
decay_lr = False
warmup_iters = 2000
lr_decay_iters = 600000
min_lr = 6e-5

backend = 'nccl'

# -- runtime --
device = 'cuda'
dtype = 'float16'
compile = True
|
config/train_gpt2_new_resume.py
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Resume configuration for the 'gpt2-new' run
# (identical to train_gpt2_new.py except init_from = 'resume').
model_config = 'gpt2-new'

log_file = 'logs/gpt2-new.log'

# -- run / evaluation bookkeeping --
out_dir = 'out/gpt2-new'
eval_interval = 500
log_interval = 1
eval_iters = 500
eval_only = False
always_save_checkpoint = False
init_from = 'resume'

# -- weights & biases logging --
wandb_log = False
wandb_project = 'owt'
wandb_run_name = 'gpt2'

# -- data / batching --
dataset = 'openwebtext'
gradient_accumulation_steps = 64
batch_size = 1
block_size = 1024

# -- model shape --
n_layer = 36
n_head = 16
n_embd = 1024
vocab_size = 50304
dropout = 0.0
bias = False

# -- optimizer --
learning_rate = 3e-4
max_iters = 50000
weight_decay = 1e-1
beta1 = 0.9
beta2 = 0.95
grad_clip = 1.0

# -- learning-rate schedule --
decay_lr = True
warmup_iters = 1000
lr_decay_iters = 50000
min_lr = 3e-5

backend = 'nccl'

# -- runtime --
device = 'cuda'
dtype = 'float16'
compile = True
|
config/train_gpt2_resume.py
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Resume configuration for the GPT-2 run
# (identical to train_gpt2.py except init_from = 'resume').
model_config = 'gpt2'

log_file = 'logs/gpt2.log'

# -- run / evaluation bookkeeping --
out_dir = 'out/gpt2'
eval_interval = 500
log_interval = 1
eval_iters = 500
eval_only = False
always_save_checkpoint = False
init_from = 'resume'

# -- weights & biases logging --
wandb_log = False
wandb_project = 'owt'
wandb_run_name = 'gpt2'

# -- data / batching --
dataset = 'openwebtext'
gradient_accumulation_steps = 64
batch_size = 1
block_size = 1024

# -- model shape --
n_layer = 36
n_head = 16
n_embd = 1024
vocab_size = 50304
dropout = 0.0
bias = False

# -- optimizer --
learning_rate = 3e-4
max_iters = 50000
weight_decay = 1e-1
beta1 = 0.9
beta2 = 0.95
grad_clip = 1.0

# -- learning-rate schedule --
decay_lr = True
warmup_iters = 1000
lr_decay_iters = 50000
min_lr = 3e-5

backend = 'nccl'

# -- runtime --
device = 'cuda'
dtype = 'float16'
compile = True
|
config/train_reflow_1.py
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
# Training configuration: reflow-1 (32-layer reflow model, constant LR).
model_config = 'reflow'

log_file = 'logs/reflow-1.log'

# -- run / evaluation bookkeeping --
out_dir = 'out/reflow-1'
eval_interval = 500
log_interval = 1
eval_iters = 500
eval_only = False
always_save_checkpoint = False
init_from = 'scratch'

# -- weights & biases logging --
wandb_log = False
wandb_project = 'reflow-1'
wandb_run_name = 'reflow-1'

# -- data / batching --
dataset = 'openwebtext'
gradient_accumulation_steps = 64
batch_size = 1
block_size = 1024

# -- model shape --
n_layer = 32
n_head = 16
n_embd = 1024
n_signals = 1024  # reflow-specific signal dimension
vocab_size = 50304
dropout = 0.0
bias = False

# -- optimizer --
learning_rate = 1.5e-4
max_iters = 50000
weight_decay = 0.1
beta1 = 0.9
beta2 = 0.95
grad_clip = 1.0

# -- learning-rate schedule (disabled: constant LR) --
decay_lr = False
warmup_iters = 2000
lr_decay_iters = 600000
min_lr = 6e-5

backend = 'nccl'

# -- runtime --
device = 'cuda'
dtype = 'float16'
compile = True
|
config/train_reflow_1_big.py
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
# Training configuration: reflow-1-big (36-layer reflow model, decaying LR).
model_config = 'reflow'

log_file = 'logs/reflow-1-big.log'

# -- run / evaluation bookkeeping --
out_dir = 'out/reflow-1-big'
eval_interval = 500
log_interval = 1
eval_iters = 500
eval_only = False
always_save_checkpoint = False
init_from = 'scratch'

# -- weights & biases logging --
wandb_log = False
wandb_project = 'reflow-1-big'
wandb_run_name = 'reflow-1-big'

# -- data / batching --
dataset = 'openwebtext'
gradient_accumulation_steps = 64
batch_size = 1
block_size = 1024

# -- model shape --
n_layer = 36
n_head = 16
n_embd = 1024
n_signals = 1024
vocab_size = 50304
dropout = 0.0
bias = False

# -- optimizer --
learning_rate = 3e-4
max_iters = 50000
weight_decay = 1e-1
beta1 = 0.9
beta2 = 0.95
grad_clip = 1.0

# -- learning-rate schedule --
decay_lr = True
warmup_iters = 1000
lr_decay_iters = 50000
min_lr = 3e-5

backend = 'nccl'

# -- runtime --
device = 'cuda'
dtype = 'float16'
compile = True
|
config/train_reflow_1_big_resume.py
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
# Resume configuration for reflow-1-big
# (identical to train_reflow_1_big.py except init_from = 'resume').
model_config = 'reflow'

log_file = 'logs/reflow-1-big.log'

# -- run / evaluation bookkeeping --
out_dir = 'out/reflow-1-big'
eval_interval = 500
log_interval = 1
eval_iters = 500
eval_only = False
always_save_checkpoint = False
init_from = 'resume'

# -- weights & biases logging --
wandb_log = False
wandb_project = 'reflow-1-big'
wandb_run_name = 'reflow-1-big'

# -- data / batching --
dataset = 'openwebtext'
gradient_accumulation_steps = 64
batch_size = 1
block_size = 1024

# -- model shape --
n_layer = 36
n_head = 16
n_embd = 1024
n_signals = 1024
vocab_size = 50304
dropout = 0.0
bias = False

# -- optimizer --
learning_rate = 3e-4
max_iters = 50000
weight_decay = 1e-1
beta1 = 0.9
beta2 = 0.95
grad_clip = 1.0

# -- learning-rate schedule --
decay_lr = True
warmup_iters = 1000
lr_decay_iters = 50000
min_lr = 3e-5

backend = 'nccl'

# -- runtime --
device = 'cuda'
dtype = 'float16'
compile = True
|
config/train_reflow_1_lite.py
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
model_config = 'reflow-lite'
|
| 3 |
+
|
| 4 |
+
log_file = 'logs/reflow-1-lite.log'
|
| 5 |
+
|
| 6 |
+
out_dir = 'out/reflow-1-lite'
|
| 7 |
+
eval_interval = 500
|
| 8 |
+
log_interval = 1
|
| 9 |
+
eval_iters = 500
|
| 10 |
+
eval_only = False
|
| 11 |
+
always_save_checkpoint = False
|
| 12 |
+
init_from = 'scratch'
|
| 13 |
+
|
| 14 |
+
wandb_log = False
|
| 15 |
+
wandb_project = 'reflow-lite'
|
| 16 |
+
wandb_run_name = 'reflow-lite'
|
| 17 |
+
|
| 18 |
+
dataset = 'openwebtext'
|
| 19 |
+
gradient_accumulation_steps = 40
|
| 20 |
+
batch_size = 1
|
| 21 |
+
block_size = 1024
|
| 22 |
+
|
| 23 |
+
n_layer = 32
|
| 24 |
+
n_head = 16
|
| 25 |
+
n_embd = 1024
|
| 26 |
+
n_signals = 1024
|
| 27 |
+
vocab_size = 50304
|
| 28 |
+
dropout = 0.0
|
| 29 |
+
bias = False
|
| 30 |
+
|
| 31 |
+
mlp_expansion_ratio = 2.66
|
| 32 |
+
n_kv_head = 4
|
| 33 |
+
|
| 34 |
+
learning_rate = 1e-4
|
| 35 |
+
max_iters = 100000
|
| 36 |
+
weight_decay = 0.1
|
| 37 |
+
beta1 = 0.9
|
| 38 |
+
beta2 = 0.95
|
| 39 |
+
grad_clip = 1.0
|
| 40 |
+
|
| 41 |
+
decay_lr = True
|
| 42 |
+
warmup_iters = 2000
|
| 43 |
+
lr_decay_iters = 100000
|
| 44 |
+
min_lr = 1e-5
|
| 45 |
+
|
| 46 |
+
backend = 'nccl'
|
| 47 |
+
device = 'cuda'
|
| 48 |
+
dtype = 'bfloat16'
|
| 49 |
+
compile = True
|
config/train_reflow_1_lite_resume.py
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
model_config = 'reflow-lite'
|
| 3 |
+
|
| 4 |
+
log_file = 'logs/reflow-1-lite.log'
|
| 5 |
+
|
| 6 |
+
out_dir = 'out/reflow-1-lite'
|
| 7 |
+
eval_interval = 500
|
| 8 |
+
log_interval = 1
|
| 9 |
+
eval_iters = 500
|
| 10 |
+
eval_only = False
|
| 11 |
+
always_save_checkpoint = False
|
| 12 |
+
init_from = 'resume'
|
| 13 |
+
|
| 14 |
+
wandb_log = False
|
| 15 |
+
wandb_project = 'reflow-lite'
|
| 16 |
+
wandb_run_name = 'reflow-lite'
|
| 17 |
+
|
| 18 |
+
dataset = 'openwebtext'
|
| 19 |
+
gradient_accumulation_steps = 40
|
| 20 |
+
batch_size = 1
|
| 21 |
+
block_size = 1024
|
| 22 |
+
|
| 23 |
+
n_layer = 32
|
| 24 |
+
n_head = 16
|
| 25 |
+
n_embd = 1024
|
| 26 |
+
n_signals = 1024
|
| 27 |
+
vocab_size = 50304
|
| 28 |
+
dropout = 0.0
|
| 29 |
+
bias = False
|
| 30 |
+
|
| 31 |
+
mlp_expansion_ratio = 2.66
|
| 32 |
+
n_kv_head = 4
|
| 33 |
+
|
| 34 |
+
learning_rate = 1e-4
|
| 35 |
+
max_iters = 100000
|
| 36 |
+
weight_decay = 0.1
|
| 37 |
+
beta1 = 0.9
|
| 38 |
+
beta2 = 0.95
|
| 39 |
+
grad_clip = 1.0
|
| 40 |
+
|
| 41 |
+
decay_lr = True
|
| 42 |
+
warmup_iters = 2000
|
| 43 |
+
lr_decay_iters = 100000
|
| 44 |
+
min_lr = 1e-5
|
| 45 |
+
|
| 46 |
+
backend = 'nccl'
|
| 47 |
+
device = 'cuda'
|
| 48 |
+
dtype = 'bfloat16'
|
| 49 |
+
compile = True
|
config/train_reflow_1_resume.py
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
model_config = 'reflow'
|
| 3 |
+
|
| 4 |
+
log_file = 'logs/reflow-1.log'
|
| 5 |
+
|
| 6 |
+
out_dir = 'out/reflow-1'
|
| 7 |
+
eval_interval = 500
|
| 8 |
+
log_interval = 1
|
| 9 |
+
eval_iters = 500
|
| 10 |
+
eval_only = False
|
| 11 |
+
always_save_checkpoint = False
|
| 12 |
+
init_from = 'resume'
|
| 13 |
+
|
| 14 |
+
wandb_log = False
|
| 15 |
+
wandb_project = 'reflow-1'
|
| 16 |
+
wandb_run_name = 'reflow-1'
|
| 17 |
+
|
| 18 |
+
dataset = 'openwebtext'
|
| 19 |
+
gradient_accumulation_steps = 64
|
| 20 |
+
batch_size = 1
|
| 21 |
+
block_size = 1024
|
| 22 |
+
|
| 23 |
+
n_layer = 32
|
| 24 |
+
n_head = 16
|
| 25 |
+
n_embd = 1024
|
| 26 |
+
n_signals = 1024
|
| 27 |
+
vocab_size = 50304
|
| 28 |
+
dropout = 0.0
|
| 29 |
+
bias = False
|
| 30 |
+
|
| 31 |
+
learning_rate = 1.5e-4
|
| 32 |
+
max_iters = 50000
|
| 33 |
+
weight_decay = 0.1
|
| 34 |
+
beta1 = 0.9
|
| 35 |
+
beta2 = 0.95
|
| 36 |
+
grad_clip = 1.0
|
| 37 |
+
|
| 38 |
+
decay_lr = False
|
| 39 |
+
warmup_iters = 2000
|
| 40 |
+
lr_decay_iters = 600000
|
| 41 |
+
min_lr = 6e-5
|
| 42 |
+
|
| 43 |
+
backend = 'nccl'
|
| 44 |
+
device = 'cuda'
|
| 45 |
+
dtype = 'float16'
|
| 46 |
+
compile = True
|
config/train_reflow_1_small.py
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
model_config = 'reflow'
|
| 3 |
+
|
| 4 |
+
log_file = 'logs/reflow-1-small.log'
|
| 5 |
+
|
| 6 |
+
out_dir = 'out/reflow-1-small'
|
| 7 |
+
eval_interval = 500
|
| 8 |
+
log_interval = 1
|
| 9 |
+
eval_iters = 500
|
| 10 |
+
eval_only = False
|
| 11 |
+
always_save_checkpoint = False
|
| 12 |
+
init_from = 'scratch'
|
| 13 |
+
|
| 14 |
+
wandb_log = False
|
| 15 |
+
wandb_project = 'reflow-1-small'
|
| 16 |
+
wandb_run_name = 'reflow-1-small'
|
| 17 |
+
|
| 18 |
+
dataset = 'openwebtext'
|
| 19 |
+
gradient_accumulation_steps = 64
|
| 20 |
+
batch_size = 1
|
| 21 |
+
block_size = 1024
|
| 22 |
+
|
| 23 |
+
n_layer = 6
|
| 24 |
+
n_head = 8
|
| 25 |
+
n_embd = 512
|
| 26 |
+
n_signals = 512
|
| 27 |
+
vocab_size = 50304
|
| 28 |
+
dropout = 0.0
|
| 29 |
+
bias = False
|
| 30 |
+
|
| 31 |
+
learning_rate = 1.5e-4
|
| 32 |
+
max_iters = 50000
|
| 33 |
+
weight_decay = 0.1
|
| 34 |
+
beta1 = 0.9
|
| 35 |
+
beta2 = 0.95
|
| 36 |
+
grad_clip = 1.0
|
| 37 |
+
|
| 38 |
+
decay_lr = False
|
| 39 |
+
warmup_iters = 2000
|
| 40 |
+
lr_decay_iters = 600000
|
| 41 |
+
min_lr = 6e-5
|
| 42 |
+
|
| 43 |
+
backend = 'nccl'
|
| 44 |
+
device = 'cuda'
|
| 45 |
+
dtype = 'float16'
|
| 46 |
+
compile = True
|
config/train_reflow_1_small_resume.py
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
model_config = 'reflow'
|
| 3 |
+
|
| 4 |
+
log_file = 'logs/reflow-1-small.log'
|
| 5 |
+
|
| 6 |
+
out_dir = 'out/reflow-1-small'
|
| 7 |
+
eval_interval = 500
|
| 8 |
+
log_interval = 1
|
| 9 |
+
eval_iters = 500
|
| 10 |
+
eval_only = False
|
| 11 |
+
always_save_checkpoint = False
|
| 12 |
+
init_from = 'resume'
|
| 13 |
+
|
| 14 |
+
wandb_log = False
|
| 15 |
+
wandb_project = 'reflow-1-small'
|
| 16 |
+
wandb_run_name = 'reflow-1-small'
|
| 17 |
+
|
| 18 |
+
dataset = 'openwebtext'
|
| 19 |
+
gradient_accumulation_steps = 64
|
| 20 |
+
batch_size = 1
|
| 21 |
+
block_size = 1024
|
| 22 |
+
|
| 23 |
+
n_layer = 6
|
| 24 |
+
n_head = 8
|
| 25 |
+
n_embd = 512
|
| 26 |
+
n_signals = 512
|
| 27 |
+
vocab_size = 50304
|
| 28 |
+
dropout = 0.0
|
| 29 |
+
bias = False
|
| 30 |
+
|
| 31 |
+
learning_rate = 1.5e-4
|
| 32 |
+
max_iters = 50000
|
| 33 |
+
weight_decay = 0.1
|
| 34 |
+
beta1 = 0.9
|
| 35 |
+
beta2 = 0.95
|
| 36 |
+
grad_clip = 1.0
|
| 37 |
+
|
| 38 |
+
decay_lr = False
|
| 39 |
+
warmup_iters = 2000
|
| 40 |
+
lr_decay_iters = 600000
|
| 41 |
+
min_lr = 6e-5
|
| 42 |
+
|
| 43 |
+
backend = 'nccl'
|
| 44 |
+
device = 'cuda'
|
| 45 |
+
dtype = 'float16'
|
| 46 |
+
compile = True
|
config/train_reflow_1_small_sp.py
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
model_config = 'reflow'
|
| 3 |
+
|
| 4 |
+
log_file = 'logs/reflow-1-small-sp.log'
|
| 5 |
+
|
| 6 |
+
out_dir = 'out/reflow-1-small-sp'
|
| 7 |
+
eval_interval = 500
|
| 8 |
+
log_interval = 1
|
| 9 |
+
eval_iters = 500
|
| 10 |
+
eval_only = False
|
| 11 |
+
always_save_checkpoint = False
|
| 12 |
+
init_from = 'scratch'
|
| 13 |
+
|
| 14 |
+
wandb_log = False
|
| 15 |
+
wandb_project = 'reflow-1-small-sp'
|
| 16 |
+
wandb_run_name = 'reflow-1-small-sp'
|
| 17 |
+
|
| 18 |
+
dataset = 'openwebtext'
|
| 19 |
+
gradient_accumulation_steps = 64
|
| 20 |
+
batch_size = 1
|
| 21 |
+
block_size = 1024
|
| 22 |
+
|
| 23 |
+
n_layer = 12
|
| 24 |
+
n_head = 6
|
| 25 |
+
n_embd = 384
|
| 26 |
+
n_signals = 384
|
| 27 |
+
vocab_size = 50304
|
| 28 |
+
dropout = 0.0
|
| 29 |
+
bias = False
|
| 30 |
+
|
| 31 |
+
learning_rate = 1.5e-4
|
| 32 |
+
max_iters = 50000
|
| 33 |
+
weight_decay = 0.1
|
| 34 |
+
beta1 = 0.9
|
| 35 |
+
beta2 = 0.95
|
| 36 |
+
grad_clip = 1.0
|
| 37 |
+
|
| 38 |
+
decay_lr = False
|
| 39 |
+
warmup_iters = 2000
|
| 40 |
+
lr_decay_iters = 600000
|
| 41 |
+
min_lr = 6e-5
|
| 42 |
+
|
| 43 |
+
backend = 'nccl'
|
| 44 |
+
device = 'cuda'
|
| 45 |
+
dtype = 'float16'
|
| 46 |
+
compile = True
|
config/train_reflow_1_small_sp_resume.py
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
model_config = 'reflow'
|
| 3 |
+
|
| 4 |
+
log_file = 'logs/reflow-1-small-sp.log'
|
| 5 |
+
|
| 6 |
+
out_dir = 'out/reflow-1-small-sp'
|
| 7 |
+
eval_interval = 500
|
| 8 |
+
log_interval = 1
|
| 9 |
+
eval_iters = 500
|
| 10 |
+
eval_only = False
|
| 11 |
+
always_save_checkpoint = False
|
| 12 |
+
init_from = 'resume'
|
| 13 |
+
|
| 14 |
+
wandb_log = False
|
| 15 |
+
wandb_project = 'reflow-1-small-sp'
|
| 16 |
+
wandb_run_name = 'reflow-1-small-sp'
|
| 17 |
+
|
| 18 |
+
dataset = 'openwebtext'
|
| 19 |
+
gradient_accumulation_steps = 64
|
| 20 |
+
batch_size = 1
|
| 21 |
+
block_size = 1024
|
| 22 |
+
|
| 23 |
+
n_layer = 12
|
| 24 |
+
n_head = 6
|
| 25 |
+
n_embd = 384
|
| 26 |
+
n_signals = 384
|
| 27 |
+
vocab_size = 50304
|
| 28 |
+
dropout = 0.0
|
| 29 |
+
bias = False
|
| 30 |
+
|
| 31 |
+
learning_rate = 1.5e-4
|
| 32 |
+
max_iters = 50000
|
| 33 |
+
weight_decay = 0.1
|
| 34 |
+
beta1 = 0.9
|
| 35 |
+
beta2 = 0.95
|
| 36 |
+
grad_clip = 1.0
|
| 37 |
+
|
| 38 |
+
decay_lr = False
|
| 39 |
+
warmup_iters = 2000
|
| 40 |
+
lr_decay_iters = 600000
|
| 41 |
+
min_lr = 6e-5
|
| 42 |
+
|
| 43 |
+
backend = 'nccl'
|
| 44 |
+
device = 'cuda'
|
| 45 |
+
dtype = 'float16'
|
| 46 |
+
compile = True
|
config/train_reflow_1_topk.py
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
model_config = 'reflow-topk'
|
| 3 |
+
|
| 4 |
+
log_file = 'logs/reflow-1-topk.log'
|
| 5 |
+
|
| 6 |
+
out_dir = 'out/reflow-1-topk'
|
| 7 |
+
eval_interval = 500
|
| 8 |
+
log_interval = 1
|
| 9 |
+
eval_iters = 500
|
| 10 |
+
eval_only = False
|
| 11 |
+
always_save_checkpoint = False
|
| 12 |
+
init_from = 'scratch'
|
| 13 |
+
|
| 14 |
+
wandb_log = False
|
| 15 |
+
wandb_project = 'reflow-1-topk'
|
| 16 |
+
wandb_run_name = 'reflow-1-topk'
|
| 17 |
+
|
| 18 |
+
dataset = 'openwebtext'
|
| 19 |
+
gradient_accumulation_steps = 64
|
| 20 |
+
batch_size = 1
|
| 21 |
+
block_size = 1024
|
| 22 |
+
|
| 23 |
+
n_layer = 6
|
| 24 |
+
n_head = 8
|
| 25 |
+
n_embd = 512
|
| 26 |
+
n_signals = 512
|
| 27 |
+
vocab_size = 50304
|
| 28 |
+
dropout = 0.0
|
| 29 |
+
bias = False
|
| 30 |
+
max_active_signals = 32
|
| 31 |
+
sparsity_penalty = 1e-3
|
| 32 |
+
|
| 33 |
+
learning_rate = 3e-4
|
| 34 |
+
max_iters = 50000
|
| 35 |
+
weight_decay = 1e-1
|
| 36 |
+
beta1 = 0.9
|
| 37 |
+
beta2 = 0.95
|
| 38 |
+
grad_clip = 1.0
|
| 39 |
+
|
| 40 |
+
decay_lr = True
|
| 41 |
+
warmup_iters = 1000
|
| 42 |
+
lr_decay_iters = 50000
|
| 43 |
+
min_lr = 3e-5
|
| 44 |
+
|
| 45 |
+
backend = 'nccl'
|
| 46 |
+
device = 'cuda'
|
| 47 |
+
dtype = 'float16'
|
| 48 |
+
compile = True
|
config/train_reflow_1_topk_big.py
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
model_config = 'reflow-topk'
|
| 3 |
+
|
| 4 |
+
log_file = 'logs/reflow-1-topk-big.log'
|
| 5 |
+
|
| 6 |
+
out_dir = 'out/reflow-1-topk-big'
|
| 7 |
+
eval_interval = 500
|
| 8 |
+
log_interval = 1
|
| 9 |
+
eval_iters = 500
|
| 10 |
+
eval_only = False
|
| 11 |
+
always_save_checkpoint = False
|
| 12 |
+
init_from = 'scratch'
|
| 13 |
+
|
| 14 |
+
wandb_log = False
|
| 15 |
+
wandb_project = 'reflow-1-topk-big'
|
| 16 |
+
wandb_run_name = 'reflow-1-topk-big'
|
| 17 |
+
|
| 18 |
+
dataset = 'openwebtext'
|
| 19 |
+
gradient_accumulation_steps = 64
|
| 20 |
+
batch_size = 1
|
| 21 |
+
block_size = 1024
|
| 22 |
+
|
| 23 |
+
n_layer = 36
|
| 24 |
+
n_head = 16
|
| 25 |
+
n_embd = 1024
|
| 26 |
+
n_signals = 1024
|
| 27 |
+
vocab_size = 50304
|
| 28 |
+
dropout = 0.0
|
| 29 |
+
bias = False
|
| 30 |
+
max_active_signals = 64
|
| 31 |
+
sparsity_penalty = 1e-3
|
| 32 |
+
|
| 33 |
+
learning_rate = 3e-4
|
| 34 |
+
max_iters = 50000
|
| 35 |
+
weight_decay = 1e-1
|
| 36 |
+
beta1 = 0.9
|
| 37 |
+
beta2 = 0.95
|
| 38 |
+
grad_clip = 1.0
|
| 39 |
+
|
| 40 |
+
decay_lr = True
|
| 41 |
+
warmup_iters = 1000
|
| 42 |
+
lr_decay_iters = 50000
|
| 43 |
+
min_lr = 3e-5
|
| 44 |
+
|
| 45 |
+
backend = 'nccl'
|
| 46 |
+
device = 'cuda'
|
| 47 |
+
dtype = 'float16'
|
| 48 |
+
compile = True
|
config/train_reflow_1_topk_big_resume.py
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
model_config = 'reflow-topk'
|
| 3 |
+
|
| 4 |
+
log_file = 'logs/reflow-1-topk-big.log'
|
| 5 |
+
|
| 6 |
+
out_dir = 'out/reflow-1-topk-big'
|
| 7 |
+
eval_interval = 500
|
| 8 |
+
log_interval = 1
|
| 9 |
+
eval_iters = 500
|
| 10 |
+
eval_only = False
|
| 11 |
+
always_save_checkpoint = False
|
| 12 |
+
init_from = 'resume'
|
| 13 |
+
|
| 14 |
+
wandb_log = False
|
| 15 |
+
wandb_project = 'reflow-1-topk-big'
|
| 16 |
+
wandb_run_name = 'reflow-1-topk-big'
|
| 17 |
+
|
| 18 |
+
dataset = 'openwebtext'
|
| 19 |
+
gradient_accumulation_steps = 64
|
| 20 |
+
batch_size = 1
|
| 21 |
+
block_size = 1024
|
| 22 |
+
|
| 23 |
+
n_layer = 36
|
| 24 |
+
n_head = 16
|
| 25 |
+
n_embd = 1024
|
| 26 |
+
n_signals = 1024
|
| 27 |
+
vocab_size = 50304
|
| 28 |
+
dropout = 0.0
|
| 29 |
+
bias = False
|
| 30 |
+
max_active_signals = 64
|
| 31 |
+
sparsity_penalty = 1e-3
|
| 32 |
+
|
| 33 |
+
learning_rate = 3e-4
|
| 34 |
+
max_iters = 50000
|
| 35 |
+
weight_decay = 1e-1
|
| 36 |
+
beta1 = 0.9
|
| 37 |
+
beta2 = 0.95
|
| 38 |
+
grad_clip = 1.0
|
| 39 |
+
|
| 40 |
+
decay_lr = True
|
| 41 |
+
warmup_iters = 1000
|
| 42 |
+
lr_decay_iters = 50000
|
| 43 |
+
min_lr = 3e-5
|
| 44 |
+
|
| 45 |
+
backend = 'nccl'
|
| 46 |
+
device = 'cuda'
|
| 47 |
+
dtype = 'float16'
|
| 48 |
+
compile = True
|
config/train_reflow_base.py
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
model_config = 'reflow'
|
| 3 |
+
|
| 4 |
+
out_dir = 'out-reflow'
|
| 5 |
+
eval_interval = 250
|
| 6 |
+
log_interval = 10
|
| 7 |
+
eval_iters = 200
|
| 8 |
+
eval_only = False
|
| 9 |
+
always_save_checkpoint = False
|
| 10 |
+
init_from = 'scratch'
|
| 11 |
+
|
| 12 |
+
wandb_log = False
|
| 13 |
+
wandb_project = 'reflow'
|
| 14 |
+
wandb_run_name = 'reflow'
|
| 15 |
+
|
| 16 |
+
dataset = 'openwebtext'
|
| 17 |
+
gradient_accumulation_steps = 64
|
| 18 |
+
batch_size = 1
|
| 19 |
+
block_size = 1024
|
| 20 |
+
|
| 21 |
+
n_layer = 32
|
| 22 |
+
n_head = 16
|
| 23 |
+
n_embd = 1024
|
| 24 |
+
n_signals = 1024
|
| 25 |
+
dropout = 0.0
|
| 26 |
+
bias = False
|
| 27 |
+
|
| 28 |
+
learning_rate = 1.5e-4
|
| 29 |
+
max_iters = 50000
|
| 30 |
+
weight_decay = 0.1
|
| 31 |
+
beta1 = 0.9
|
| 32 |
+
beta2 = 0.95
|
| 33 |
+
grad_clip = 1.0
|
| 34 |
+
|
| 35 |
+
decay_lr = False
|
| 36 |
+
warmup_iters = 2000
|
| 37 |
+
lr_decay_iters = 600000
|
| 38 |
+
min_lr = 6e-5
|
| 39 |
+
|
| 40 |
+
backend = 'nccl'
|
| 41 |
+
device = 'cuda'
|
| 42 |
+
dtype = 'float16'
|
| 43 |
+
compile = True
|
config/train_resume.py
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
import time
|
| 3 |
+
|
| 4 |
+
model_config = 'gpt2'
|
| 5 |
+
|
| 6 |
+
out_dir = 'out-web'
|
| 7 |
+
eval_interval = 50
|
| 8 |
+
log_interval = 1
|
| 9 |
+
eval_iters = 40
|
| 10 |
+
eval_only = False
|
| 11 |
+
always_save_checkpoint = False
|
| 12 |
+
|
| 13 |
+
wandb_log = True
|
| 14 |
+
wandb_project = 'resume'
|
| 15 |
+
wandb_run_name = 'resume-' + str(time.time())
|
| 16 |
+
|
| 17 |
+
dataset = 'openwebtext'
|
| 18 |
+
gradient_accumulation_steps = 64
|
| 19 |
+
batch_size = 1
|
| 20 |
+
block_size = 1024
|
| 21 |
+
|
| 22 |
+
n_layer = 12
|
| 23 |
+
n_head = 12
|
| 24 |
+
n_embd = 768
|
| 25 |
+
dropout = 0.0
|
| 26 |
+
bias = False
|
| 27 |
+
|
| 28 |
+
learning_rate = 1e-6
|
| 29 |
+
max_iters = 20
|
| 30 |
+
weight_decay = 0.1
|
| 31 |
+
beta1 = 0.9
|
| 32 |
+
beta2 = 0.95
|
| 33 |
+
grad_clip = 1.0
|
| 34 |
+
|
| 35 |
+
decay_lr = True
|
| 36 |
+
warmup_iters = 0
|
| 37 |
+
lr_decay_iters = 20
|
| 38 |
+
min_lr = 1e-6
|
| 39 |
+
|
| 40 |
+
backend = 'nccl'
|
| 41 |
+
device = 'cuda'
|
| 42 |
+
dtype = 'bfloat16'
|
| 43 |
+
compile = True
|
| 44 |
+
init_from = 'resume'
|
config/train_sft.py
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
model_config = 'gpt2'
|
| 3 |
+
|
| 4 |
+
out_dir = 'out-sft'
|
| 5 |
+
base_model_dir = 'out'
|
| 6 |
+
init_from = 'finetune'
|
| 7 |
+
|
| 8 |
+
eval_interval = 250
|
| 9 |
+
log_interval = 1
|
| 10 |
+
eval_iters = 40
|
| 11 |
+
eval_only = False
|
| 12 |
+
always_save_checkpoint = True
|
| 13 |
+
|
| 14 |
+
dataset = 'alpaca'
|
| 15 |
+
gradient_accumulation_steps = 4
|
| 16 |
+
batch_size = 4
|
| 17 |
+
block_size = 1024
|
| 18 |
+
|
| 19 |
+
n_layer = 12
|
| 20 |
+
n_head = 12
|
| 21 |
+
n_embd = 768
|
| 22 |
+
dropout = 0.1
|
| 23 |
+
bias = False
|
| 24 |
+
|
| 25 |
+
learning_rate = 2e-5
|
| 26 |
+
max_iters = 3000
|
| 27 |
+
weight_decay = 0.1
|
| 28 |
+
beta1 = 0.9
|
| 29 |
+
beta2 = 0.95
|
| 30 |
+
grad_clip = 1.0
|
| 31 |
+
|
| 32 |
+
decay_lr = True
|
| 33 |
+
warmup_iters = 100
|
| 34 |
+
lr_decay_iters = 3000
|
| 35 |
+
min_lr = 1e-6
|
| 36 |
+
|
| 37 |
+
sft_masking = True
|
| 38 |
+
|
| 39 |
+
backend = 'nccl'
|
| 40 |
+
device = 'cuda'
|
| 41 |
+
dtype = 'bfloat16'
|
| 42 |
+
compile = True
|
config/train_shakespeare_char.py
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
model_config = 'gpt2'
|
| 3 |
+
|
| 4 |
+
out_dir = 'out-shakespeare-char'
|
| 5 |
+
eval_interval = 250
|
| 6 |
+
log_interval = 10
|
| 7 |
+
eval_iters = 200
|
| 8 |
+
eval_only = False
|
| 9 |
+
always_save_checkpoint = False
|
| 10 |
+
|
| 11 |
+
wandb_log = False
|
| 12 |
+
wandb_project = 'shakespeare-char'
|
| 13 |
+
wandb_run_name = 'mini-gpt'
|
| 14 |
+
|
| 15 |
+
dataset = 'shakespeare_char'
|
| 16 |
+
gradient_accumulation_steps = 1
|
| 17 |
+
batch_size = 64
|
| 18 |
+
block_size = 256
|
| 19 |
+
|
| 20 |
+
n_layer = 6
|
| 21 |
+
n_head = 6
|
| 22 |
+
n_embd = 384
|
| 23 |
+
dropout = 0.2
|
| 24 |
+
bias = False
|
| 25 |
+
|
| 26 |
+
learning_rate = 1e-3
|
| 27 |
+
max_iters = 5000
|
| 28 |
+
weight_decay = 1e-1
|
| 29 |
+
beta1 = 0.9
|
| 30 |
+
beta2 = 0.99
|
| 31 |
+
grad_clip = 1.0
|
| 32 |
+
|
| 33 |
+
decay_lr = True
|
| 34 |
+
warmup_iters = 100
|
| 35 |
+
lr_decay_iters = 5000
|
| 36 |
+
min_lr = 1e-4
|
| 37 |
+
|
| 38 |
+
backend = 'nccl'
|
| 39 |
+
device = 'cuda'
|
| 40 |
+
dtype = 'bfloat16'
|
| 41 |
+
compile = True
|
| 42 |
+
init_from = 'scratch'
|