Upload folder using huggingface_hub
Browse filesThis view is limited to 50 files because it contains too many changes. See raw diff
- logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/.hydra/config.yaml +56 -0
- logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/.hydra/hydra.yaml +168 -0
- logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/.hydra/overrides.yaml +26 -0
- logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_100.pth +3 -0
- logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_1000.pth +3 -0
- logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_10000.pth +3 -0
- logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_1100.pth +3 -0
- logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_1200.pth +3 -0
- logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_1300.pth +3 -0
- logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_1400.pth +3 -0
- logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_1500.pth +3 -0
- logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_1600.pth +3 -0
- logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_1700.pth +3 -0
- logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_1800.pth +3 -0
- logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_1900.pth +3 -0
- logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_200.pth +3 -0
- logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_2000.pth +3 -0
- logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_2100.pth +3 -0
- logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_2200.pth +3 -0
- logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_2300.pth +3 -0
- logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_2400.pth +3 -0
- logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_2500.pth +3 -0
- logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_2600.pth +3 -0
- logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_2700.pth +3 -0
- logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_2800.pth +3 -0
- logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_2900.pth +3 -0
- logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_300.pth +3 -0
- logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_3000.pth +3 -0
- logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_3100.pth +3 -0
- logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_3200.pth +3 -0
- logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_3300.pth +3 -0
- logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_3400.pth +3 -0
- logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_3500.pth +3 -0
- logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_3600.pth +3 -0
- logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_3700.pth +3 -0
- logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_3800.pth +3 -0
- logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_3900.pth +3 -0
- logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_400.pth +3 -0
- logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_4000.pth +3 -0
- logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_4100.pth +3 -0
- logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_4200.pth +3 -0
- logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_4300.pth +3 -0
- logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_4400.pth +3 -0
- logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_4500.pth +3 -0
- logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_4600.pth +3 -0
- logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_4700.pth +3 -0
- logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_4800.pth +3 -0
- logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_4900.pth +3 -0
- logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_500.pth +3 -0
- logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_5000.pth +3 -0
logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/.hydra/config.yaml
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
dataset:
|
| 2 |
+
name: fineweb
|
| 3 |
+
root: /mnt/hdfs/__MERLIN_USER_DIR__/data/fineweb
|
| 4 |
+
seed: 42
|
| 5 |
+
size: 10000000000
|
| 6 |
+
input_bin: data/fineweb/fineweb10B/fineweb_train_*.bin
|
| 7 |
+
input_val_bin: data/fineweb/fineweb10B/fineweb_val_*.bin
|
| 8 |
+
model:
|
| 9 |
+
name: gpt2_small
|
| 10 |
+
n_embd: 768
|
| 11 |
+
n_head: 12
|
| 12 |
+
n_layers: 12
|
| 13 |
+
vocab_size: 50257
|
| 14 |
+
rotary: true
|
| 15 |
+
auto_regressive: true
|
| 16 |
+
swiglu: false
|
| 17 |
+
tie_embedding_and_output_weights: true
|
| 18 |
+
attn_output_gate: true
|
| 19 |
+
use_qk_norm: true
|
| 20 |
+
training:
|
| 21 |
+
seed: 42
|
| 22 |
+
steps: 10000
|
| 23 |
+
sequence_length: 1024
|
| 24 |
+
max_global_steps: null
|
| 25 |
+
max_local_steps: null
|
| 26 |
+
compile: true
|
| 27 |
+
val: true
|
| 28 |
+
val_max_steps: 50
|
| 29 |
+
log_every: 1
|
| 30 |
+
val_every: 200
|
| 31 |
+
save_every: 100
|
| 32 |
+
optimizer:
|
| 33 |
+
name: adamw
|
| 34 |
+
lr: 0.0018
|
| 35 |
+
beta1: 0.9
|
| 36 |
+
beta2: 0.95
|
| 37 |
+
eps: 1.0e-08
|
| 38 |
+
weight_decay: 0.1
|
| 39 |
+
batch_size: 64
|
| 40 |
+
minibatch_size: 32
|
| 41 |
+
scheduler:
|
| 42 |
+
name: wsd
|
| 43 |
+
warmup_steps: 400
|
| 44 |
+
start_steps: 8000
|
| 45 |
+
end_steps: 10000
|
| 46 |
+
gamma: 0
|
| 47 |
+
type: nlp
|
| 48 |
+
project_name: stochastic-eos
|
| 49 |
+
exp_name: nlp-eos
|
| 50 |
+
run_name: ${mk_run_name:${type},${dataset},${model},${training},${now:%y%m%d-%H%M%S}}
|
| 51 |
+
wandb:
|
| 52 |
+
use: true
|
| 53 |
+
project: seos-nlp
|
| 54 |
+
name: ${mk_run_name_short:${type},${dataset},${model},${training},${now:%y%m%d-%H%M%S}}
|
| 55 |
+
update_hdfs: true
|
| 56 |
+
remote_root: hdfs://haruna/home/byte_data_seed/ssd_hldy/user/yuhang.cai/stochastic-eos
|
logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/.hydra/hydra.yaml
ADDED
|
@@ -0,0 +1,168 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
hydra:
|
| 2 |
+
run:
|
| 3 |
+
dir: logs/${project_name}/${exp_name}/${run_name}
|
| 4 |
+
sweep:
|
| 5 |
+
dir: logs/${project_name}/${exp_name}/multirun/${run_name}
|
| 6 |
+
subdir: ${hydra.job.num}
|
| 7 |
+
launcher:
|
| 8 |
+
_target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher
|
| 9 |
+
sweeper:
|
| 10 |
+
_target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper
|
| 11 |
+
max_batch_size: null
|
| 12 |
+
params: null
|
| 13 |
+
help:
|
| 14 |
+
app_name: ${hydra.job.name}
|
| 15 |
+
header: '${hydra.help.app_name} is powered by Hydra.
|
| 16 |
+
|
| 17 |
+
'
|
| 18 |
+
footer: 'Powered by Hydra (https://hydra.cc)
|
| 19 |
+
|
| 20 |
+
Use --hydra-help to view Hydra specific help
|
| 21 |
+
|
| 22 |
+
'
|
| 23 |
+
template: '${hydra.help.header}
|
| 24 |
+
|
| 25 |
+
== Configuration groups ==
|
| 26 |
+
|
| 27 |
+
Compose your configuration from those groups (group=option)
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
$APP_CONFIG_GROUPS
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
== Config ==
|
| 34 |
+
|
| 35 |
+
Override anything in the config (foo.bar=value)
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
$CONFIG
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
${hydra.help.footer}
|
| 42 |
+
|
| 43 |
+
'
|
| 44 |
+
hydra_help:
|
| 45 |
+
template: 'Hydra (${hydra.runtime.version})
|
| 46 |
+
|
| 47 |
+
See https://hydra.cc for more info.
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
== Flags ==
|
| 51 |
+
|
| 52 |
+
$FLAGS_HELP
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
== Configuration groups ==
|
| 56 |
+
|
| 57 |
+
Compose your configuration from those groups (For example, append hydra/job_logging=disabled
|
| 58 |
+
to command line)
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
$HYDRA_CONFIG_GROUPS
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
Use ''--cfg hydra'' to Show the Hydra config.
|
| 65 |
+
|
| 66 |
+
'
|
| 67 |
+
hydra_help: ???
|
| 68 |
+
hydra_logging:
|
| 69 |
+
version: 1
|
| 70 |
+
formatters:
|
| 71 |
+
simple:
|
| 72 |
+
format: '[%(asctime)s][HYDRA] %(message)s'
|
| 73 |
+
handlers:
|
| 74 |
+
console:
|
| 75 |
+
class: logging.StreamHandler
|
| 76 |
+
formatter: simple
|
| 77 |
+
stream: ext://sys.stdout
|
| 78 |
+
root:
|
| 79 |
+
level: INFO
|
| 80 |
+
handlers:
|
| 81 |
+
- console
|
| 82 |
+
loggers:
|
| 83 |
+
logging_example:
|
| 84 |
+
level: DEBUG
|
| 85 |
+
disable_existing_loggers: false
|
| 86 |
+
job_logging:
|
| 87 |
+
version: 1
|
| 88 |
+
root:
|
| 89 |
+
level: ERROR
|
| 90 |
+
disable_existing_loggers: true
|
| 91 |
+
env: {}
|
| 92 |
+
mode: RUN
|
| 93 |
+
searchpath: []
|
| 94 |
+
callbacks: {}
|
| 95 |
+
output_subdir: .hydra
|
| 96 |
+
overrides:
|
| 97 |
+
hydra:
|
| 98 |
+
- hydra.mode=RUN
|
| 99 |
+
task:
|
| 100 |
+
- dataset=fineweb
|
| 101 |
+
- dataset.input_bin=data/fineweb/fineweb10B/fineweb_train_*.bin
|
| 102 |
+
- dataset.input_val_bin=data/fineweb/fineweb10B/fineweb_val_*.bin
|
| 103 |
+
- model=gpt2_small
|
| 104 |
+
- training=adamw_nlp
|
| 105 |
+
- wandb.use=true
|
| 106 |
+
- training.seed=42
|
| 107 |
+
- training.log_every=1
|
| 108 |
+
- training.val_every=200
|
| 109 |
+
- training.save_every=100
|
| 110 |
+
- training.steps=10000
|
| 111 |
+
- training.optimizer.name=adamw
|
| 112 |
+
- training.optimizer.lr=0.0018
|
| 113 |
+
- training.optimizer.weight_decay=0.1
|
| 114 |
+
- training.optimizer.beta1=0.9
|
| 115 |
+
- training.optimizer.beta2=0.95
|
| 116 |
+
- training.optimizer.eps=1e-8
|
| 117 |
+
- training.optimizer.batch_size=64
|
| 118 |
+
- training.optimizer.minibatch_size=32
|
| 119 |
+
- training.compile=true
|
| 120 |
+
- training.sequence_length=1024
|
| 121 |
+
- training.scheduler.name=wsd
|
| 122 |
+
- training.scheduler.warmup_steps=400
|
| 123 |
+
- training.scheduler.start_steps=8000
|
| 124 |
+
- training.scheduler.end_steps=10000
|
| 125 |
+
- training.scheduler.gamma=0
|
| 126 |
+
job:
|
| 127 |
+
name: train_nlp
|
| 128 |
+
chdir: null
|
| 129 |
+
override_dirname: dataset.input_bin=data/fineweb/fineweb10B/fineweb_train_*.bin,dataset.input_val_bin=data/fineweb/fineweb10B/fineweb_val_*.bin,dataset=fineweb,model=gpt2_small,training.compile=true,training.log_every=1,training.optimizer.batch_size=64,training.optimizer.beta1=0.9,training.optimizer.beta2=0.95,training.optimizer.eps=1e-8,training.optimizer.lr=0.0018,training.optimizer.minibatch_size=32,training.optimizer.name=adamw,training.optimizer.weight_decay=0.1,training.save_every=100,training.scheduler.end_steps=10000,training.scheduler.gamma=0,training.scheduler.name=wsd,training.scheduler.start_steps=8000,training.scheduler.warmup_steps=400,training.seed=42,training.sequence_length=1024,training.steps=10000,training.val_every=200,training=adamw_nlp,wandb.use=true
|
| 130 |
+
id: ???
|
| 131 |
+
num: ???
|
| 132 |
+
config_name: config_nlp_pretrain
|
| 133 |
+
env_set: {}
|
| 134 |
+
env_copy: []
|
| 135 |
+
config:
|
| 136 |
+
override_dirname:
|
| 137 |
+
kv_sep: '='
|
| 138 |
+
item_sep: ','
|
| 139 |
+
exclude_keys: []
|
| 140 |
+
runtime:
|
| 141 |
+
version: 1.3.2
|
| 142 |
+
version_base: '1.3'
|
| 143 |
+
cwd: /data01/home/yuhang.cai/Stochastic-EoS
|
| 144 |
+
config_sources:
|
| 145 |
+
- path: hydra.conf
|
| 146 |
+
schema: pkg
|
| 147 |
+
provider: hydra
|
| 148 |
+
- path: /data01/home/yuhang.cai/Stochastic-EoS/configs
|
| 149 |
+
schema: file
|
| 150 |
+
provider: main
|
| 151 |
+
- path: ''
|
| 152 |
+
schema: structured
|
| 153 |
+
provider: schema
|
| 154 |
+
output_dir: /data01/home/yuhang.cai/Stochastic-EoS/logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438
|
| 155 |
+
choices:
|
| 156 |
+
training: adamw_nlp
|
| 157 |
+
model: gpt2_small
|
| 158 |
+
dataset: fineweb
|
| 159 |
+
hydra/env: default
|
| 160 |
+
hydra/callbacks: null
|
| 161 |
+
hydra/job_logging: disabled
|
| 162 |
+
hydra/hydra_logging: default
|
| 163 |
+
hydra/hydra_help: default
|
| 164 |
+
hydra/help: default
|
| 165 |
+
hydra/sweeper: basic
|
| 166 |
+
hydra/launcher: basic
|
| 167 |
+
hydra/output: default
|
| 168 |
+
verbose: false
|
logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/.hydra/overrides.yaml
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
- dataset=fineweb
|
| 2 |
+
- dataset.input_bin=data/fineweb/fineweb10B/fineweb_train_*.bin
|
| 3 |
+
- dataset.input_val_bin=data/fineweb/fineweb10B/fineweb_val_*.bin
|
| 4 |
+
- model=gpt2_small
|
| 5 |
+
- training=adamw_nlp
|
| 6 |
+
- wandb.use=true
|
| 7 |
+
- training.seed=42
|
| 8 |
+
- training.log_every=1
|
| 9 |
+
- training.val_every=200
|
| 10 |
+
- training.save_every=100
|
| 11 |
+
- training.steps=10000
|
| 12 |
+
- training.optimizer.name=adamw
|
| 13 |
+
- training.optimizer.lr=0.0018
|
| 14 |
+
- training.optimizer.weight_decay=0.1
|
| 15 |
+
- training.optimizer.beta1=0.9
|
| 16 |
+
- training.optimizer.beta2=0.95
|
| 17 |
+
- training.optimizer.eps=1e-8
|
| 18 |
+
- training.optimizer.batch_size=64
|
| 19 |
+
- training.optimizer.minibatch_size=32
|
| 20 |
+
- training.compile=true
|
| 21 |
+
- training.sequence_length=1024
|
| 22 |
+
- training.scheduler.name=wsd
|
| 23 |
+
- training.scheduler.warmup_steps=400
|
| 24 |
+
- training.scheduler.start_steps=8000
|
| 25 |
+
- training.scheduler.end_steps=10000
|
| 26 |
+
- training.scheduler.gamma=0
|
logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_100.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:77f671f1878098c5a11eac28fcc27dd281151896a6a552c54f60510e1dc255a2
|
| 3 |
+
size 2031036461
|
logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_1000.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:fe10e6833b87f80f23cc95a25b820d3424c4b731f5fe5aa01f71acf5380320a0
|
| 3 |
+
size 2031037067
|
logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_10000.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1bea4ffb13139e5ee11a361dd7fc73589d4d05ce6b4259ec50f6a4143145e848
|
| 3 |
+
size 2031037673
|
logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_1100.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:fa14d665faf6509e845fc44b2124557995681b41124a4b71254738cda5feaf9a
|
| 3 |
+
size 2031037067
|
logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_1200.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:98fd37ab0c372da734f1aa047320438c7205819e5682acb7b87cc463301fd8d3
|
| 3 |
+
size 2031037067
|
logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_1300.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:cf767a9345ed6f62e83621052a67dc9eeff0ae652bb034249994b1a746848597
|
| 3 |
+
size 2031037067
|
logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_1400.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:868dc8092b01f501fea5da56f3d45109d35ff8371081048c40397d97d8b4247f
|
| 3 |
+
size 2031037067
|
logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_1500.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e7ed12a4e914f158a1a2e1b9b8bd1d92060599ad082d43a06c911193b374a4ff
|
| 3 |
+
size 2031037067
|
logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_1600.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b6882ad82f64df7f099c5f3d354861a36a4ad16203373b3a8be8f1065e24ae93
|
| 3 |
+
size 2031037067
|
logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_1700.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0f035051c0998108bab5a2cd9ebb48965741d9bb36bfcac0dfe8ce079e2a87b5
|
| 3 |
+
size 2031037067
|
logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_1800.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d2350f36f1bfef9fc095e4cf341d9f1eb12a77a9f9ada031d4db2efa87afe4a3
|
| 3 |
+
size 2031037067
|
logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_1900.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6146c92a9317a8e9246b42e27ee41ae027bd4b7c275ce1b5ab4920ee1847e554
|
| 3 |
+
size 2031037067
|
logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_200.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:94bfc0cfae168dab8e2a17cf15bc0f92b0e9388175992a701a67182f6ff2bdb7
|
| 3 |
+
size 2031036461
|
logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_2000.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:cc900cc93306f9b4c4f85da37d9fd589475fdab56dcdde0b4d0087076d11f3e2
|
| 3 |
+
size 2031037067
|
logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_2100.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:bf219bc1711b9d11ac4f48fcf42f57913d9beabca4d201269e926288b19f75ed
|
| 3 |
+
size 2031037067
|
logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_2200.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:520155cbd6933f3b18d51468f81ccae32c932eb4817edd61113e7fa28c692649
|
| 3 |
+
size 2031037067
|
logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_2300.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2e7c5ce56e8f323feccf3c32d5434457b42470a9e0a10683faee9877871cacfe
|
| 3 |
+
size 2031037067
|
logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_2400.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:aec8403fa2877b9f4f3982322b46060cad09817b502222251809c7bd0563de7e
|
| 3 |
+
size 2031037067
|
logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_2500.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:70a2acd285760f5e10fde52a3396f3356015b7f36bf8b9ff6951989bcf6ad07c
|
| 3 |
+
size 2031037067
|
logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_2600.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7f1177fc94c65cf755f52d9389bf4eceb7da2719d636dd7567d99b6ffd9702c4
|
| 3 |
+
size 2031037067
|
logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_2700.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c62fd999e2310584c7099789e5394dc6b0349b77c2217432b3423d00af1aa644
|
| 3 |
+
size 2031037067
|
logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_2800.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:14db0f51c1ed12a7e3b68fd6b81fa565618adbab724b8046e028d5846b021cc1
|
| 3 |
+
size 2031037067
|
logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_2900.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:10c9d9cf83751a5fbdd572ef04255dc1b871aafea4a061f5ff07d5ed646b6ebc
|
| 3 |
+
size 2031037067
|
logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_300.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6ef760ad94a2be7b4a05bdc58968a46c5dd2a1412729f4e87bc3859cc481db13
|
| 3 |
+
size 2031036461
|
logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_3000.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f4c628c86f1c6113fbec2ded90d2edbf5c25706441acc895990bc78e8da28f0f
|
| 3 |
+
size 2031037067
|
logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_3100.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:79a6965e63494b9406697aada4c4ebacad24016bcdf204f2c08ce20a355ba686
|
| 3 |
+
size 2031037067
|
logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_3200.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d58ad203e5ac948a3935a308c537b189b36d5a565f2e40f78c55794b63aa617f
|
| 3 |
+
size 2031037067
|
logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_3300.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7767c47bfd26f9f29b18197919513762dfabdd02a9a1b2d558e250625821ddea
|
| 3 |
+
size 2031037067
|
logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_3400.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4f7fc949b98e61bc872416d4df76872e0936742e53f129065f8c3cfe446cd1d7
|
| 3 |
+
size 2031037067
|
logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_3500.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7cdd784aac9809b4ecd293cd07c3d82c48cb7d320674712aedcec3e647f817c1
|
| 3 |
+
size 2031037067
|
logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_3600.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:cb49c6bfc38666d70c34b820a62c883bcd7d3465d73a4efd66d3bc4a8f2ee2bf
|
| 3 |
+
size 2031037067
|
logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_3700.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e8c75345429b5cc64b65cfc19bbaaf284f22846f4a8eb03bb3441e15dc228689
|
| 3 |
+
size 2031037067
|
logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_3800.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:90addd1d091564835c51b609aeef0f3f0b6eb551befa36990c107e9f5bddf8d4
|
| 3 |
+
size 2031037067
|
logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_3900.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5714ef9b103d9da7ce534b2f705d20af995cdc64da66254911e7418968260a56
|
| 3 |
+
size 2031037067
|
logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_400.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:35f87d552adf8010b4acc914a60af05ef20c80001f3e0eab31e02da8040658ff
|
| 3 |
+
size 2031036461
|
logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_4000.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:18886b6298ae422c9a95e1675799e9ebbd5cd0ae63c87001cbfad6c53caca468
|
| 3 |
+
size 2031037067
|
logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_4100.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:aebc2b5fd1345502e6a6b24dded7d05802e200dedee53510b351ef04c0f17516
|
| 3 |
+
size 2031037067
|
logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_4200.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:fc5a2cbd5e098bbd5e5994ee13c72766973cf9dacab0a66e03775790e2ff4e17
|
| 3 |
+
size 2031037067
|
logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_4300.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c6d1275a17091ee545b2efeeba5c6ee4d8199171291d0e6dcbe11f1f7d245368
|
| 3 |
+
size 2031037067
|
logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_4400.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9e4d6db73da3e7ede8659cf0c080107e14c7783cbb2550a051790fe1819e6764
|
| 3 |
+
size 2031037067
|
logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_4500.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b130671ee783053f2759f3afa418d011972b1722161e59bca488d9f2f66d139a
|
| 3 |
+
size 2031037067
|
logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_4600.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:55e04a4ed522ff27a5aa3df8d9bf0fb572954eba01b6e3b9ddc26f3a7fe0e35b
|
| 3 |
+
size 2031037067
|
logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_4700.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8d64168199cb94e316ed7f84fe1c2445113ee5bdb9a7e66425ac1a2e8429001e
|
| 3 |
+
size 2031037067
|
logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_4800.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:af2b30e7f0df4d0461aae75e3fbd67d03bac7cd5d10064ff6cf6b4baa3f48309
|
| 3 |
+
size 2031037067
|
logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_4900.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:fdc35cd922268a0b5738a48836281482e96396780b937c4e11f4a4d112566968
|
| 3 |
+
size 2031037067
|
logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_500.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:97ca9e6b997b09123184d0c55f877590806eaeba319a4fe171d5d5dd903e9ac3
|
| 3 |
+
size 2031036461
|
logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_5000.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c3b311277c115606076228e320a44569d5a2a6acccc6f9c38e48204b19211c5f
|
| 3 |
+
size 2031037067
|