faro1219 committed on
Commit
938160b
·
verified ·
1 Parent(s): 64d0b77

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes. See raw diff
Files changed (50) hide show
  1. logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/.hydra/config.yaml +56 -0
  2. logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/.hydra/hydra.yaml +168 -0
  3. logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/.hydra/overrides.yaml +26 -0
  4. logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_100.pth +3 -0
  5. logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_1000.pth +3 -0
  6. logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_10000.pth +3 -0
  7. logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_1100.pth +3 -0
  8. logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_1200.pth +3 -0
  9. logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_1300.pth +3 -0
  10. logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_1400.pth +3 -0
  11. logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_1500.pth +3 -0
  12. logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_1600.pth +3 -0
  13. logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_1700.pth +3 -0
  14. logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_1800.pth +3 -0
  15. logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_1900.pth +3 -0
  16. logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_200.pth +3 -0
  17. logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_2000.pth +3 -0
  18. logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_2100.pth +3 -0
  19. logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_2200.pth +3 -0
  20. logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_2300.pth +3 -0
  21. logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_2400.pth +3 -0
  22. logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_2500.pth +3 -0
  23. logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_2600.pth +3 -0
  24. logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_2700.pth +3 -0
  25. logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_2800.pth +3 -0
  26. logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_2900.pth +3 -0
  27. logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_300.pth +3 -0
  28. logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_3000.pth +3 -0
  29. logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_3100.pth +3 -0
  30. logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_3200.pth +3 -0
  31. logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_3300.pth +3 -0
  32. logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_3400.pth +3 -0
  33. logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_3500.pth +3 -0
  34. logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_3600.pth +3 -0
  35. logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_3700.pth +3 -0
  36. logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_3800.pth +3 -0
  37. logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_3900.pth +3 -0
  38. logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_400.pth +3 -0
  39. logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_4000.pth +3 -0
  40. logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_4100.pth +3 -0
  41. logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_4200.pth +3 -0
  42. logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_4300.pth +3 -0
  43. logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_4400.pth +3 -0
  44. logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_4500.pth +3 -0
  45. logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_4600.pth +3 -0
  46. logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_4700.pth +3 -0
  47. logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_4800.pth +3 -0
  48. logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_4900.pth +3 -0
  49. logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_500.pth +3 -0
  50. logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_5000.pth +3 -0
logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/.hydra/config.yaml ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ dataset:
2
+ name: fineweb
3
+ root: /mnt/hdfs/__MERLIN_USER_DIR__/data/fineweb
4
+ seed: 42
5
+ size: 10000000000
6
+ input_bin: data/fineweb/fineweb10B/fineweb_train_*.bin
7
+ input_val_bin: data/fineweb/fineweb10B/fineweb_val_*.bin
8
+ model:
9
+ name: gpt2_small
10
+ n_embd: 768
11
+ n_head: 12
12
+ n_layers: 12
13
+ vocab_size: 50257
14
+ rotary: true
15
+ auto_regressive: true
16
+ swiglu: false
17
+ tie_embedding_and_output_weights: true
18
+ attn_output_gate: true
19
+ use_qk_norm: true
20
+ training:
21
+ seed: 42
22
+ steps: 10000
23
+ sequence_length: 1024
24
+ max_global_steps: null
25
+ max_local_steps: null
26
+ compile: true
27
+ val: true
28
+ val_max_steps: 50
29
+ log_every: 1
30
+ val_every: 200
31
+ save_every: 100
32
+ optimizer:
33
+ name: adamw
34
+ lr: 0.0018
35
+ beta1: 0.9
36
+ beta2: 0.95
37
+ eps: 1.0e-08
38
+ weight_decay: 0.1
39
+ batch_size: 64
40
+ minibatch_size: 32
41
+ scheduler:
42
+ name: wsd
43
+ warmup_steps: 400
44
+ start_steps: 8000
45
+ end_steps: 10000
46
+ gamma: 0
47
+ type: nlp
48
+ project_name: stochastic-eos
49
+ exp_name: nlp-eos
50
+ run_name: ${mk_run_name:${type},${dataset},${model},${training},${now:%y%m%d-%H%M%S}}
51
+ wandb:
52
+ use: true
53
+ project: seos-nlp
54
+ name: ${mk_run_name_short:${type},${dataset},${model},${training},${now:%y%m%d-%H%M%S}}
55
+ update_hdfs: true
56
+ remote_root: hdfs://haruna/home/byte_data_seed/ssd_hldy/user/yuhang.cai/stochastic-eos
logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/.hydra/hydra.yaml ADDED
@@ -0,0 +1,168 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ hydra:
2
+ run:
3
+ dir: logs/${project_name}/${exp_name}/${run_name}
4
+ sweep:
5
+ dir: logs/${project_name}/${exp_name}/multirun/${run_name}
6
+ subdir: ${hydra.job.num}
7
+ launcher:
8
+ _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher
9
+ sweeper:
10
+ _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper
11
+ max_batch_size: null
12
+ params: null
13
+ help:
14
+ app_name: ${hydra.job.name}
15
+ header: '${hydra.help.app_name} is powered by Hydra.
16
+
17
+ '
18
+ footer: 'Powered by Hydra (https://hydra.cc)
19
+
20
+ Use --hydra-help to view Hydra specific help
21
+
22
+ '
23
+ template: '${hydra.help.header}
24
+
25
+ == Configuration groups ==
26
+
27
+ Compose your configuration from those groups (group=option)
28
+
29
+
30
+ $APP_CONFIG_GROUPS
31
+
32
+
33
+ == Config ==
34
+
35
+ Override anything in the config (foo.bar=value)
36
+
37
+
38
+ $CONFIG
39
+
40
+
41
+ ${hydra.help.footer}
42
+
43
+ '
44
+ hydra_help:
45
+ template: 'Hydra (${hydra.runtime.version})
46
+
47
+ See https://hydra.cc for more info.
48
+
49
+
50
+ == Flags ==
51
+
52
+ $FLAGS_HELP
53
+
54
+
55
+ == Configuration groups ==
56
+
57
+ Compose your configuration from those groups (For example, append hydra/job_logging=disabled
58
+ to command line)
59
+
60
+
61
+ $HYDRA_CONFIG_GROUPS
62
+
63
+
64
+ Use ''--cfg hydra'' to Show the Hydra config.
65
+
66
+ '
67
+ hydra_help: ???
68
+ hydra_logging:
69
+ version: 1
70
+ formatters:
71
+ simple:
72
+ format: '[%(asctime)s][HYDRA] %(message)s'
73
+ handlers:
74
+ console:
75
+ class: logging.StreamHandler
76
+ formatter: simple
77
+ stream: ext://sys.stdout
78
+ root:
79
+ level: INFO
80
+ handlers:
81
+ - console
82
+ loggers:
83
+ logging_example:
84
+ level: DEBUG
85
+ disable_existing_loggers: false
86
+ job_logging:
87
+ version: 1
88
+ root:
89
+ level: ERROR
90
+ disable_existing_loggers: true
91
+ env: {}
92
+ mode: RUN
93
+ searchpath: []
94
+ callbacks: {}
95
+ output_subdir: .hydra
96
+ overrides:
97
+ hydra:
98
+ - hydra.mode=RUN
99
+ task:
100
+ - dataset=fineweb
101
+ - dataset.input_bin=data/fineweb/fineweb10B/fineweb_train_*.bin
102
+ - dataset.input_val_bin=data/fineweb/fineweb10B/fineweb_val_*.bin
103
+ - model=gpt2_small
104
+ - training=adamw_nlp
105
+ - wandb.use=true
106
+ - training.seed=42
107
+ - training.log_every=1
108
+ - training.val_every=200
109
+ - training.save_every=100
110
+ - training.steps=10000
111
+ - training.optimizer.name=adamw
112
+ - training.optimizer.lr=0.0018
113
+ - training.optimizer.weight_decay=0.1
114
+ - training.optimizer.beta1=0.9
115
+ - training.optimizer.beta2=0.95
116
+ - training.optimizer.eps=1e-8
117
+ - training.optimizer.batch_size=64
118
+ - training.optimizer.minibatch_size=32
119
+ - training.compile=true
120
+ - training.sequence_length=1024
121
+ - training.scheduler.name=wsd
122
+ - training.scheduler.warmup_steps=400
123
+ - training.scheduler.start_steps=8000
124
+ - training.scheduler.end_steps=10000
125
+ - training.scheduler.gamma=0
126
+ job:
127
+ name: train_nlp
128
+ chdir: null
129
+ override_dirname: dataset.input_bin=data/fineweb/fineweb10B/fineweb_train_*.bin,dataset.input_val_bin=data/fineweb/fineweb10B/fineweb_val_*.bin,dataset=fineweb,model=gpt2_small,training.compile=true,training.log_every=1,training.optimizer.batch_size=64,training.optimizer.beta1=0.9,training.optimizer.beta2=0.95,training.optimizer.eps=1e-8,training.optimizer.lr=0.0018,training.optimizer.minibatch_size=32,training.optimizer.name=adamw,training.optimizer.weight_decay=0.1,training.save_every=100,training.scheduler.end_steps=10000,training.scheduler.gamma=0,training.scheduler.name=wsd,training.scheduler.start_steps=8000,training.scheduler.warmup_steps=400,training.seed=42,training.sequence_length=1024,training.steps=10000,training.val_every=200,training=adamw_nlp,wandb.use=true
130
+ id: ???
131
+ num: ???
132
+ config_name: config_nlp_pretrain
133
+ env_set: {}
134
+ env_copy: []
135
+ config:
136
+ override_dirname:
137
+ kv_sep: '='
138
+ item_sep: ','
139
+ exclude_keys: []
140
+ runtime:
141
+ version: 1.3.2
142
+ version_base: '1.3'
143
+ cwd: /data01/home/yuhang.cai/Stochastic-EoS
144
+ config_sources:
145
+ - path: hydra.conf
146
+ schema: pkg
147
+ provider: hydra
148
+ - path: /data01/home/yuhang.cai/Stochastic-EoS/configs
149
+ schema: file
150
+ provider: main
151
+ - path: ''
152
+ schema: structured
153
+ provider: schema
154
+ output_dir: /data01/home/yuhang.cai/Stochastic-EoS/logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438
155
+ choices:
156
+ training: adamw_nlp
157
+ model: gpt2_small
158
+ dataset: fineweb
159
+ hydra/env: default
160
+ hydra/callbacks: null
161
+ hydra/job_logging: disabled
162
+ hydra/hydra_logging: default
163
+ hydra/hydra_help: default
164
+ hydra/help: default
165
+ hydra/sweeper: basic
166
+ hydra/launcher: basic
167
+ hydra/output: default
168
+ verbose: false
logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/.hydra/overrides.yaml ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ - dataset=fineweb
2
+ - dataset.input_bin=data/fineweb/fineweb10B/fineweb_train_*.bin
3
+ - dataset.input_val_bin=data/fineweb/fineweb10B/fineweb_val_*.bin
4
+ - model=gpt2_small
5
+ - training=adamw_nlp
6
+ - wandb.use=true
7
+ - training.seed=42
8
+ - training.log_every=1
9
+ - training.val_every=200
10
+ - training.save_every=100
11
+ - training.steps=10000
12
+ - training.optimizer.name=adamw
13
+ - training.optimizer.lr=0.0018
14
+ - training.optimizer.weight_decay=0.1
15
+ - training.optimizer.beta1=0.9
16
+ - training.optimizer.beta2=0.95
17
+ - training.optimizer.eps=1e-8
18
+ - training.optimizer.batch_size=64
19
+ - training.optimizer.minibatch_size=32
20
+ - training.compile=true
21
+ - training.sequence_length=1024
22
+ - training.scheduler.name=wsd
23
+ - training.scheduler.warmup_steps=400
24
+ - training.scheduler.start_steps=8000
25
+ - training.scheduler.end_steps=10000
26
+ - training.scheduler.gamma=0
logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_100.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:77f671f1878098c5a11eac28fcc27dd281151896a6a552c54f60510e1dc255a2
3
+ size 2031036461
logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_1000.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fe10e6833b87f80f23cc95a25b820d3424c4b731f5fe5aa01f71acf5380320a0
3
+ size 2031037067
logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_10000.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1bea4ffb13139e5ee11a361dd7fc73589d4d05ce6b4259ec50f6a4143145e848
3
+ size 2031037673
logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_1100.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fa14d665faf6509e845fc44b2124557995681b41124a4b71254738cda5feaf9a
3
+ size 2031037067
logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_1200.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:98fd37ab0c372da734f1aa047320438c7205819e5682acb7b87cc463301fd8d3
3
+ size 2031037067
logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_1300.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cf767a9345ed6f62e83621052a67dc9eeff0ae652bb034249994b1a746848597
3
+ size 2031037067
logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_1400.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:868dc8092b01f501fea5da56f3d45109d35ff8371081048c40397d97d8b4247f
3
+ size 2031037067
logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_1500.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e7ed12a4e914f158a1a2e1b9b8bd1d92060599ad082d43a06c911193b374a4ff
3
+ size 2031037067
logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_1600.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b6882ad82f64df7f099c5f3d354861a36a4ad16203373b3a8be8f1065e24ae93
3
+ size 2031037067
logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_1700.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0f035051c0998108bab5a2cd9ebb48965741d9bb36bfcac0dfe8ce079e2a87b5
3
+ size 2031037067
logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_1800.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d2350f36f1bfef9fc095e4cf341d9f1eb12a77a9f9ada031d4db2efa87afe4a3
3
+ size 2031037067
logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_1900.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6146c92a9317a8e9246b42e27ee41ae027bd4b7c275ce1b5ab4920ee1847e554
3
+ size 2031037067
logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_200.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:94bfc0cfae168dab8e2a17cf15bc0f92b0e9388175992a701a67182f6ff2bdb7
3
+ size 2031036461
logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_2000.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cc900cc93306f9b4c4f85da37d9fd589475fdab56dcdde0b4d0087076d11f3e2
3
+ size 2031037067
logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_2100.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bf219bc1711b9d11ac4f48fcf42f57913d9beabca4d201269e926288b19f75ed
3
+ size 2031037067
logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_2200.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:520155cbd6933f3b18d51468f81ccae32c932eb4817edd61113e7fa28c692649
3
+ size 2031037067
logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_2300.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2e7c5ce56e8f323feccf3c32d5434457b42470a9e0a10683faee9877871cacfe
3
+ size 2031037067
logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_2400.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aec8403fa2877b9f4f3982322b46060cad09817b502222251809c7bd0563de7e
3
+ size 2031037067
logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_2500.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:70a2acd285760f5e10fde52a3396f3356015b7f36bf8b9ff6951989bcf6ad07c
3
+ size 2031037067
logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_2600.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7f1177fc94c65cf755f52d9389bf4eceb7da2719d636dd7567d99b6ffd9702c4
3
+ size 2031037067
logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_2700.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c62fd999e2310584c7099789e5394dc6b0349b77c2217432b3423d00af1aa644
3
+ size 2031037067
logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_2800.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:14db0f51c1ed12a7e3b68fd6b81fa565618adbab724b8046e028d5846b021cc1
3
+ size 2031037067
logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_2900.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:10c9d9cf83751a5fbdd572ef04255dc1b871aafea4a061f5ff07d5ed646b6ebc
3
+ size 2031037067
logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_300.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6ef760ad94a2be7b4a05bdc58968a46c5dd2a1412729f4e87bc3859cc481db13
3
+ size 2031036461
logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_3000.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f4c628c86f1c6113fbec2ded90d2edbf5c25706441acc895990bc78e8da28f0f
3
+ size 2031037067
logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_3100.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:79a6965e63494b9406697aada4c4ebacad24016bcdf204f2c08ce20a355ba686
3
+ size 2031037067
logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_3200.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d58ad203e5ac948a3935a308c537b189b36d5a565f2e40f78c55794b63aa617f
3
+ size 2031037067
logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_3300.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7767c47bfd26f9f29b18197919513762dfabdd02a9a1b2d558e250625821ddea
3
+ size 2031037067
logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_3400.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4f7fc949b98e61bc872416d4df76872e0936742e53f129065f8c3cfe446cd1d7
3
+ size 2031037067
logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_3500.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7cdd784aac9809b4ecd293cd07c3d82c48cb7d320674712aedcec3e647f817c1
3
+ size 2031037067
logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_3600.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cb49c6bfc38666d70c34b820a62c883bcd7d3465d73a4efd66d3bc4a8f2ee2bf
3
+ size 2031037067
logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_3700.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e8c75345429b5cc64b65cfc19bbaaf284f22846f4a8eb03bb3441e15dc228689
3
+ size 2031037067
logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_3800.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:90addd1d091564835c51b609aeef0f3f0b6eb551befa36990c107e9f5bddf8d4
3
+ size 2031037067
logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_3900.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5714ef9b103d9da7ce534b2f705d20af995cdc64da66254911e7418968260a56
3
+ size 2031037067
logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_400.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:35f87d552adf8010b4acc914a60af05ef20c80001f3e0eab31e02da8040658ff
3
+ size 2031036461
logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_4000.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:18886b6298ae422c9a95e1675799e9ebbd5cd0ae63c87001cbfad6c53caca468
3
+ size 2031037067
logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_4100.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aebc2b5fd1345502e6a6b24dded7d05802e200dedee53510b351ef04c0f17516
3
+ size 2031037067
logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_4200.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fc5a2cbd5e098bbd5e5994ee13c72766973cf9dacab0a66e03775790e2ff4e17
3
+ size 2031037067
logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_4300.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c6d1275a17091ee545b2efeeba5c6ee4d8199171291d0e6dcbe11f1f7d245368
3
+ size 2031037067
logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_4400.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9e4d6db73da3e7ede8659cf0c080107e14c7783cbb2550a051790fe1819e6764
3
+ size 2031037067
logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_4500.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b130671ee783053f2759f3afa418d011972b1722161e59bca488d9f2f66d139a
3
+ size 2031037067
logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_4600.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:55e04a4ed522ff27a5aa3df8d9bf0fb572954eba01b6e3b9ddc26f3a7fe0e35b
3
+ size 2031037067
logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_4700.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8d64168199cb94e316ed7f84fe1c2445113ee5bdb9a7e66425ac1a2e8429001e
3
+ size 2031037067
logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_4800.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:af2b30e7f0df4d0461aae75e3fbd67d03bac7cd5d10064ff6cf6b4baa3f48309
3
+ size 2031037067
logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_4900.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fdc35cd922268a0b5738a48836281482e96396780b937c4e11f4a4d112566968
3
+ size 2031037067
logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_500.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:97ca9e6b997b09123184d0c55f877590806eaeba319a4fe171d5d5dd903e9ac3
3
+ size 2031036461
logs/stochastic-eos/nlp-eos/next-token-prediction-pretrain_fineweb_gpt2_small-attnOgate-qkNorm_adamw-lr0.0018-b0.9-0.95-eps1e-08-wd0.1-bs64_wsd-10000-400-8000-10000-0_251127-155438/checkpoints/step_5000.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c3b311277c115606076228e320a44569d5a2a6acccc6f9c38e48204b19211c5f
3
+ size 2031037067