Upload folder using huggingface_hub
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- .gitattributes +3 -0
- 2024-09-23/06-36-18/.hydra/config.yaml +74 -0
- 2024-09-23/06-36-18/.hydra/hydra.yaml +154 -0
- 2024-09-23/06-36-18/.hydra/overrides.yaml +1 -0
- 2024-09-23/06-36-18/train.log +0 -0
- 2024-09-23/07-06-14/.hydra/config.yaml +74 -0
- 2024-09-23/07-06-14/.hydra/hydra.yaml +154 -0
- 2024-09-23/07-06-14/.hydra/overrides.yaml +1 -0
- 2024-09-23/07-06-14/train.log +0 -0
- 2024-09-23/08-39-13/.hydra/config.yaml +74 -0
- 2024-09-23/08-39-13/.hydra/hydra.yaml +154 -0
- 2024-09-23/08-39-13/.hydra/overrides.yaml +1 -0
- 2024-09-23/08-39-13/train.log +0 -0
- 2024-09-23/08-40-08/.hydra/config.yaml +74 -0
- 2024-09-23/08-40-08/.hydra/hydra.yaml +154 -0
- 2024-09-23/08-40-08/.hydra/overrides.yaml +1 -0
- 2024-09-23/08-40-08/train.log +0 -0
- 2024-09-23/08-40-08/wandb/debug-internal.log +14 -0
- 2024-09-23/08-40-08/wandb/debug.log +26 -0
- 2024-09-23/08-40-08/wandb/run-20240923_091422-a2kxhd8v/files/config.yaml +114 -0
- 2024-09-23/08-40-08/wandb/run-20240923_091422-a2kxhd8v/files/output.log +3 -0
- 2024-09-23/08-40-08/wandb/run-20240923_091422-a2kxhd8v/files/requirements.txt +121 -0
- 2024-09-23/08-40-08/wandb/run-20240923_091422-a2kxhd8v/files/wandb-metadata.json +88 -0
- 2024-09-23/08-40-08/wandb/run-20240923_091422-a2kxhd8v/files/wandb-summary.json +1 -0
- 2024-09-23/08-40-08/wandb/run-20240923_091422-a2kxhd8v/logs/debug-core.log +12 -0
- 2024-09-23/08-40-08/wandb/run-20240923_091422-a2kxhd8v/logs/debug-internal.log +14 -0
- 2024-09-23/08-40-08/wandb/run-20240923_091422-a2kxhd8v/logs/debug.log +26 -0
- 2024-09-23/08-40-08/wandb/run-20240923_091422-a2kxhd8v/run-a2kxhd8v.wandb +0 -0
- 2024-09-23/09-32-28/.hydra/config.yaml +74 -0
- 2024-09-23/09-32-28/.hydra/hydra.yaml +154 -0
- 2024-09-23/09-32-28/.hydra/overrides.yaml +1 -0
- 2024-09-23/09-32-28/train.log +0 -0
- 2024-09-23/09-32-28/wandb/debug-internal.log +18 -0
- 2024-09-23/09-32-28/wandb/debug.log +26 -0
- 2024-09-23/09-32-28/wandb/run-20240923_093237-tkkvzfon/files/config.yaml +115 -0
- 2024-09-23/09-32-28/wandb/run-20240923_093237-tkkvzfon/files/output.log +33 -0
- 2024-09-23/09-32-28/wandb/run-20240923_093237-tkkvzfon/files/wandb-metadata.json +88 -0
- 2024-09-23/09-32-28/wandb/run-20240923_093237-tkkvzfon/files/wandb-summary.json +1 -0
- 2024-09-23/09-32-28/wandb/run-20240923_093237-tkkvzfon/logs/debug-core.log +13 -0
- 2024-09-23/09-32-28/wandb/run-20240923_093237-tkkvzfon/logs/debug-internal.log +18 -0
- 2024-09-23/09-32-28/wandb/run-20240923_093237-tkkvzfon/logs/debug.log +26 -0
- 2024-09-23/09-32-28/wandb/run-20240923_093237-tkkvzfon/run-tkkvzfon.wandb +0 -0
- 2024-09-23/09-33-58/.hydra/config.yaml +74 -0
- 2024-09-23/09-33-58/.hydra/hydra.yaml +154 -0
- 2024-09-23/09-33-58/.hydra/overrides.yaml +1 -0
- 2024-09-23/09-33-58/checkpoints/ckpt_1000.pt +3 -0
- 2024-09-23/09-33-58/checkpoints/ckpt_2000.pt +3 -0
- 2024-09-23/09-33-58/checkpoints/ckpt_3000.pt +3 -0
- 2024-09-23/09-33-58/checkpoints/ckpt_4000.pt +3 -0
- 2024-09-23/09-33-58/checkpoints/ckpt_5000.pt +3 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
2024-09-23/09-33-58/wandb/run-20240923_093407-jnzzkcth/run-jnzzkcth.wandb filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
2024-09-23/15-02-55/wandb/run-20240923_150304-bbl5fd2u/run-bbl5fd2u.wandb filter=lfs diff=lfs merge=lfs -text
|
| 38 |
+
2024-09-23/15-28-03/wandb/run-20240923_152812-jp82yqcj/run-jp82yqcj.wandb filter=lfs diff=lfs merge=lfs -text
|
2024-09-23/06-36-18/.hydra/config.yaml
ADDED
|
@@ -0,0 +1,74 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
experimental:
|
| 2 |
+
model:
|
| 3 |
+
core_model_type: pass_through
|
| 4 |
+
hidden_dim: 384
|
| 5 |
+
byte_hidden: 128
|
| 6 |
+
max_chunk_length: 12
|
| 7 |
+
max_num_chunks: 1024
|
| 8 |
+
num_delimiter_layers: 3
|
| 9 |
+
num_byte_decoder_layers: 5
|
| 10 |
+
target_chunk_len: 8.0
|
| 11 |
+
chunk_len_loss_weight: 0.1
|
| 12 |
+
chunk_len_penalty: 0.1
|
| 13 |
+
context_window: 8192
|
| 14 |
+
embedding_model_type: byte_level
|
| 15 |
+
tokenizer_type: bpe
|
| 16 |
+
tokenizer_dataset_name: simple_en_wiki
|
| 17 |
+
tokenizer_simplify_data: true
|
| 18 |
+
vocab_size: 259
|
| 19 |
+
lm_head_type: byte_level
|
| 20 |
+
lm_head_normalization: rms_norm
|
| 21 |
+
lm_head_bias: false
|
| 22 |
+
lm_head_dropout: 0.0
|
| 23 |
+
model_shell_type: byte_autoencoder_shell
|
| 24 |
+
embedding_weight_tying: true
|
| 25 |
+
ffn_weight_tying: false
|
| 26 |
+
cproj_weight_tying: false
|
| 27 |
+
positional_encoding_type: rope
|
| 28 |
+
trainer:
|
| 29 |
+
trainer_type: base_trainer
|
| 30 |
+
dataset: fineweb_edu_10B
|
| 31 |
+
batch_size: 6
|
| 32 |
+
gradient_accumulation_steps: 8
|
| 33 |
+
max_iters: 10000
|
| 34 |
+
eval_interval: 50000000
|
| 35 |
+
log_interval: 1
|
| 36 |
+
checkpoint_interval: 1000
|
| 37 |
+
eval_iters: 1000
|
| 38 |
+
run_eval: false
|
| 39 |
+
eval:
|
| 40 |
+
mcq_benchmarks: null
|
| 41 |
+
mcq_num_samples: 1000
|
| 42 |
+
eval_byte_metrics: false
|
| 43 |
+
text_modeling_eval: false
|
| 44 |
+
text_generation_eval: false
|
| 45 |
+
optimizer:
|
| 46 |
+
optimizer_name: adamW
|
| 47 |
+
lr: 0.0005
|
| 48 |
+
min_lr: 5.0e-05
|
| 49 |
+
weight_decay: 0.01
|
| 50 |
+
beta1: 0.9
|
| 51 |
+
beta2: 0.95
|
| 52 |
+
grad_clip: 1.0
|
| 53 |
+
lr_scheduler:
|
| 54 |
+
name: cosine
|
| 55 |
+
warmup_iters: 100
|
| 56 |
+
dataloader:
|
| 57 |
+
name: autoencoder
|
| 58 |
+
datasampling:
|
| 59 |
+
name: standard
|
| 60 |
+
loss_fn:
|
| 61 |
+
name: pass_through
|
| 62 |
+
general:
|
| 63 |
+
logging:
|
| 64 |
+
wandb_log: true
|
| 65 |
+
wandb_project: SuperTinyLanguageModels
|
| 66 |
+
wandb_run_name: null
|
| 67 |
+
group_name: experimental_byte_level
|
| 68 |
+
paths:
|
| 69 |
+
output_dir: outputs
|
| 70 |
+
data_dir: data
|
| 71 |
+
checkpoint_dir: checkpoints
|
| 72 |
+
eval_dir: evals
|
| 73 |
+
seed: 489
|
| 74 |
+
device: cuda
|
2024-09-23/06-36-18/.hydra/hydra.yaml
ADDED
|
@@ -0,0 +1,154 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
hydra:
|
| 2 |
+
run:
|
| 3 |
+
dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S}
|
| 4 |
+
sweep:
|
| 5 |
+
dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S}
|
| 6 |
+
subdir: ${hydra.job.num}
|
| 7 |
+
launcher:
|
| 8 |
+
_target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher
|
| 9 |
+
sweeper:
|
| 10 |
+
_target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper
|
| 11 |
+
max_batch_size: null
|
| 12 |
+
params: null
|
| 13 |
+
help:
|
| 14 |
+
app_name: ${hydra.job.name}
|
| 15 |
+
header: '${hydra.help.app_name} is powered by Hydra.
|
| 16 |
+
|
| 17 |
+
'
|
| 18 |
+
footer: 'Powered by Hydra (https://hydra.cc)
|
| 19 |
+
|
| 20 |
+
Use --hydra-help to view Hydra specific help
|
| 21 |
+
|
| 22 |
+
'
|
| 23 |
+
template: '${hydra.help.header}
|
| 24 |
+
|
| 25 |
+
== Configuration groups ==
|
| 26 |
+
|
| 27 |
+
Compose your configuration from those groups (group=option)
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
$APP_CONFIG_GROUPS
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
== Config ==
|
| 34 |
+
|
| 35 |
+
Override anything in the config (foo.bar=value)
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
$CONFIG
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
${hydra.help.footer}
|
| 42 |
+
|
| 43 |
+
'
|
| 44 |
+
hydra_help:
|
| 45 |
+
template: 'Hydra (${hydra.runtime.version})
|
| 46 |
+
|
| 47 |
+
See https://hydra.cc for more info.
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
== Flags ==
|
| 51 |
+
|
| 52 |
+
$FLAGS_HELP
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
== Configuration groups ==
|
| 56 |
+
|
| 57 |
+
Compose your configuration from those groups (For example, append hydra/job_logging=disabled
|
| 58 |
+
to command line)
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
$HYDRA_CONFIG_GROUPS
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
Use ''--cfg hydra'' to Show the Hydra config.
|
| 65 |
+
|
| 66 |
+
'
|
| 67 |
+
hydra_help: ???
|
| 68 |
+
hydra_logging:
|
| 69 |
+
version: 1
|
| 70 |
+
formatters:
|
| 71 |
+
simple:
|
| 72 |
+
format: '[%(asctime)s][HYDRA] %(message)s'
|
| 73 |
+
handlers:
|
| 74 |
+
console:
|
| 75 |
+
class: logging.StreamHandler
|
| 76 |
+
formatter: simple
|
| 77 |
+
stream: ext://sys.stdout
|
| 78 |
+
root:
|
| 79 |
+
level: INFO
|
| 80 |
+
handlers:
|
| 81 |
+
- console
|
| 82 |
+
loggers:
|
| 83 |
+
logging_example:
|
| 84 |
+
level: DEBUG
|
| 85 |
+
disable_existing_loggers: false
|
| 86 |
+
job_logging:
|
| 87 |
+
version: 1
|
| 88 |
+
formatters:
|
| 89 |
+
simple:
|
| 90 |
+
format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s'
|
| 91 |
+
handlers:
|
| 92 |
+
console:
|
| 93 |
+
class: logging.StreamHandler
|
| 94 |
+
formatter: simple
|
| 95 |
+
stream: ext://sys.stdout
|
| 96 |
+
file:
|
| 97 |
+
class: logging.FileHandler
|
| 98 |
+
formatter: simple
|
| 99 |
+
filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log
|
| 100 |
+
root:
|
| 101 |
+
level: INFO
|
| 102 |
+
handlers:
|
| 103 |
+
- console
|
| 104 |
+
- file
|
| 105 |
+
disable_existing_loggers: false
|
| 106 |
+
env: {}
|
| 107 |
+
mode: RUN
|
| 108 |
+
searchpath: []
|
| 109 |
+
callbacks: {}
|
| 110 |
+
output_subdir: .hydra
|
| 111 |
+
overrides:
|
| 112 |
+
hydra:
|
| 113 |
+
- hydra.mode=RUN
|
| 114 |
+
task: []
|
| 115 |
+
job:
|
| 116 |
+
name: train
|
| 117 |
+
chdir: null
|
| 118 |
+
override_dirname: ''
|
| 119 |
+
id: ???
|
| 120 |
+
num: ???
|
| 121 |
+
config_name: experimental/byte_autoencoder_1
|
| 122 |
+
env_set: {}
|
| 123 |
+
env_copy: []
|
| 124 |
+
config:
|
| 125 |
+
override_dirname:
|
| 126 |
+
kv_sep: '='
|
| 127 |
+
item_sep: ','
|
| 128 |
+
exclude_keys: []
|
| 129 |
+
runtime:
|
| 130 |
+
version: 1.3.2
|
| 131 |
+
version_base: '1.1'
|
| 132 |
+
cwd: /root/SuperTinyLanguageModels
|
| 133 |
+
config_sources:
|
| 134 |
+
- path: hydra.conf
|
| 135 |
+
schema: pkg
|
| 136 |
+
provider: hydra
|
| 137 |
+
- path: /root/SuperTinyLanguageModels/configs/train
|
| 138 |
+
schema: file
|
| 139 |
+
provider: main
|
| 140 |
+
- path: ''
|
| 141 |
+
schema: structured
|
| 142 |
+
provider: schema
|
| 143 |
+
output_dir: /root/SuperTinyLanguageModels/outputs/2024-09-23/06-36-18
|
| 144 |
+
choices:
|
| 145 |
+
hydra/env: default
|
| 146 |
+
hydra/callbacks: null
|
| 147 |
+
hydra/job_logging: default
|
| 148 |
+
hydra/hydra_logging: default
|
| 149 |
+
hydra/hydra_help: default
|
| 150 |
+
hydra/help: default
|
| 151 |
+
hydra/sweeper: basic
|
| 152 |
+
hydra/launcher: basic
|
| 153 |
+
hydra/output: default
|
| 154 |
+
verbose: false
|
2024-09-23/06-36-18/.hydra/overrides.yaml
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
[]
|
2024-09-23/06-36-18/train.log
ADDED
|
File without changes
|
2024-09-23/07-06-14/.hydra/config.yaml
ADDED
|
@@ -0,0 +1,74 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
experimental:
|
| 2 |
+
model:
|
| 3 |
+
core_model_type: pass_through
|
| 4 |
+
hidden_dim: 384
|
| 5 |
+
byte_hidden: 128
|
| 6 |
+
max_chunk_length: 12
|
| 7 |
+
max_num_chunks: 1024
|
| 8 |
+
num_delimiter_layers: 3
|
| 9 |
+
num_byte_decoder_layers: 5
|
| 10 |
+
target_chunk_len: 8.0
|
| 11 |
+
chunk_len_loss_weight: 0.1
|
| 12 |
+
chunk_len_penalty: 0.1
|
| 13 |
+
context_window: 8192
|
| 14 |
+
embedding_model_type: byte_level
|
| 15 |
+
tokenizer_type: bpe
|
| 16 |
+
tokenizer_dataset_name: simple_en_wiki
|
| 17 |
+
tokenizer_simplify_data: true
|
| 18 |
+
vocab_size: 259
|
| 19 |
+
lm_head_type: byte_level
|
| 20 |
+
lm_head_normalization: rms_norm
|
| 21 |
+
lm_head_bias: false
|
| 22 |
+
lm_head_dropout: 0.0
|
| 23 |
+
model_shell_type: byte_autoencoder_shell
|
| 24 |
+
embedding_weight_tying: true
|
| 25 |
+
ffn_weight_tying: false
|
| 26 |
+
cproj_weight_tying: false
|
| 27 |
+
positional_encoding_type: rope
|
| 28 |
+
trainer:
|
| 29 |
+
trainer_type: base_trainer
|
| 30 |
+
dataset: fineweb_edu_10B
|
| 31 |
+
batch_size: 6
|
| 32 |
+
gradient_accumulation_steps: 8
|
| 33 |
+
max_iters: 10000
|
| 34 |
+
eval_interval: 50000000
|
| 35 |
+
log_interval: 1
|
| 36 |
+
checkpoint_interval: 1000
|
| 37 |
+
eval_iters: 1000
|
| 38 |
+
run_eval: false
|
| 39 |
+
eval:
|
| 40 |
+
mcq_benchmarks: null
|
| 41 |
+
mcq_num_samples: 1000
|
| 42 |
+
eval_byte_metrics: false
|
| 43 |
+
text_modeling_eval: false
|
| 44 |
+
text_generation_eval: false
|
| 45 |
+
optimizer:
|
| 46 |
+
optimizer_name: adamW
|
| 47 |
+
lr: 0.0005
|
| 48 |
+
min_lr: 5.0e-05
|
| 49 |
+
weight_decay: 0.01
|
| 50 |
+
beta1: 0.9
|
| 51 |
+
beta2: 0.95
|
| 52 |
+
grad_clip: 1.0
|
| 53 |
+
lr_scheduler:
|
| 54 |
+
name: cosine
|
| 55 |
+
warmup_iters: 100
|
| 56 |
+
dataloader:
|
| 57 |
+
name: autoencoder
|
| 58 |
+
datasampling:
|
| 59 |
+
name: standard
|
| 60 |
+
loss_fn:
|
| 61 |
+
name: pass_through
|
| 62 |
+
general:
|
| 63 |
+
logging:
|
| 64 |
+
wandb_log: true
|
| 65 |
+
wandb_project: SuperTinyLanguageModels
|
| 66 |
+
wandb_run_name: null
|
| 67 |
+
group_name: experimental_byte_level
|
| 68 |
+
paths:
|
| 69 |
+
output_dir: outputs
|
| 70 |
+
data_dir: data
|
| 71 |
+
checkpoint_dir: checkpoints
|
| 72 |
+
eval_dir: evals
|
| 73 |
+
seed: 489
|
| 74 |
+
device: cuda
|
2024-09-23/07-06-14/.hydra/hydra.yaml
ADDED
|
@@ -0,0 +1,154 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
hydra:
|
| 2 |
+
run:
|
| 3 |
+
dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S}
|
| 4 |
+
sweep:
|
| 5 |
+
dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S}
|
| 6 |
+
subdir: ${hydra.job.num}
|
| 7 |
+
launcher:
|
| 8 |
+
_target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher
|
| 9 |
+
sweeper:
|
| 10 |
+
_target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper
|
| 11 |
+
max_batch_size: null
|
| 12 |
+
params: null
|
| 13 |
+
help:
|
| 14 |
+
app_name: ${hydra.job.name}
|
| 15 |
+
header: '${hydra.help.app_name} is powered by Hydra.
|
| 16 |
+
|
| 17 |
+
'
|
| 18 |
+
footer: 'Powered by Hydra (https://hydra.cc)
|
| 19 |
+
|
| 20 |
+
Use --hydra-help to view Hydra specific help
|
| 21 |
+
|
| 22 |
+
'
|
| 23 |
+
template: '${hydra.help.header}
|
| 24 |
+
|
| 25 |
+
== Configuration groups ==
|
| 26 |
+
|
| 27 |
+
Compose your configuration from those groups (group=option)
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
$APP_CONFIG_GROUPS
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
== Config ==
|
| 34 |
+
|
| 35 |
+
Override anything in the config (foo.bar=value)
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
$CONFIG
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
${hydra.help.footer}
|
| 42 |
+
|
| 43 |
+
'
|
| 44 |
+
hydra_help:
|
| 45 |
+
template: 'Hydra (${hydra.runtime.version})
|
| 46 |
+
|
| 47 |
+
See https://hydra.cc for more info.
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
== Flags ==
|
| 51 |
+
|
| 52 |
+
$FLAGS_HELP
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
== Configuration groups ==
|
| 56 |
+
|
| 57 |
+
Compose your configuration from those groups (For example, append hydra/job_logging=disabled
|
| 58 |
+
to command line)
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
$HYDRA_CONFIG_GROUPS
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
Use ''--cfg hydra'' to Show the Hydra config.
|
| 65 |
+
|
| 66 |
+
'
|
| 67 |
+
hydra_help: ???
|
| 68 |
+
hydra_logging:
|
| 69 |
+
version: 1
|
| 70 |
+
formatters:
|
| 71 |
+
simple:
|
| 72 |
+
format: '[%(asctime)s][HYDRA] %(message)s'
|
| 73 |
+
handlers:
|
| 74 |
+
console:
|
| 75 |
+
class: logging.StreamHandler
|
| 76 |
+
formatter: simple
|
| 77 |
+
stream: ext://sys.stdout
|
| 78 |
+
root:
|
| 79 |
+
level: INFO
|
| 80 |
+
handlers:
|
| 81 |
+
- console
|
| 82 |
+
loggers:
|
| 83 |
+
logging_example:
|
| 84 |
+
level: DEBUG
|
| 85 |
+
disable_existing_loggers: false
|
| 86 |
+
job_logging:
|
| 87 |
+
version: 1
|
| 88 |
+
formatters:
|
| 89 |
+
simple:
|
| 90 |
+
format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s'
|
| 91 |
+
handlers:
|
| 92 |
+
console:
|
| 93 |
+
class: logging.StreamHandler
|
| 94 |
+
formatter: simple
|
| 95 |
+
stream: ext://sys.stdout
|
| 96 |
+
file:
|
| 97 |
+
class: logging.FileHandler
|
| 98 |
+
formatter: simple
|
| 99 |
+
filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log
|
| 100 |
+
root:
|
| 101 |
+
level: INFO
|
| 102 |
+
handlers:
|
| 103 |
+
- console
|
| 104 |
+
- file
|
| 105 |
+
disable_existing_loggers: false
|
| 106 |
+
env: {}
|
| 107 |
+
mode: RUN
|
| 108 |
+
searchpath: []
|
| 109 |
+
callbacks: {}
|
| 110 |
+
output_subdir: .hydra
|
| 111 |
+
overrides:
|
| 112 |
+
hydra:
|
| 113 |
+
- hydra.mode=RUN
|
| 114 |
+
task: []
|
| 115 |
+
job:
|
| 116 |
+
name: train
|
| 117 |
+
chdir: null
|
| 118 |
+
override_dirname: ''
|
| 119 |
+
id: ???
|
| 120 |
+
num: ???
|
| 121 |
+
config_name: experimental/byte_autoencoder_1
|
| 122 |
+
env_set: {}
|
| 123 |
+
env_copy: []
|
| 124 |
+
config:
|
| 125 |
+
override_dirname:
|
| 126 |
+
kv_sep: '='
|
| 127 |
+
item_sep: ','
|
| 128 |
+
exclude_keys: []
|
| 129 |
+
runtime:
|
| 130 |
+
version: 1.3.2
|
| 131 |
+
version_base: '1.1'
|
| 132 |
+
cwd: /root/SuperTinyLanguageModels
|
| 133 |
+
config_sources:
|
| 134 |
+
- path: hydra.conf
|
| 135 |
+
schema: pkg
|
| 136 |
+
provider: hydra
|
| 137 |
+
- path: /root/SuperTinyLanguageModels/configs/train
|
| 138 |
+
schema: file
|
| 139 |
+
provider: main
|
| 140 |
+
- path: ''
|
| 141 |
+
schema: structured
|
| 142 |
+
provider: schema
|
| 143 |
+
output_dir: /root/SuperTinyLanguageModels/outputs/2024-09-23/07-06-14
|
| 144 |
+
choices:
|
| 145 |
+
hydra/env: default
|
| 146 |
+
hydra/callbacks: null
|
| 147 |
+
hydra/job_logging: default
|
| 148 |
+
hydra/hydra_logging: default
|
| 149 |
+
hydra/hydra_help: default
|
| 150 |
+
hydra/help: default
|
| 151 |
+
hydra/sweeper: basic
|
| 152 |
+
hydra/launcher: basic
|
| 153 |
+
hydra/output: default
|
| 154 |
+
verbose: false
|
2024-09-23/07-06-14/.hydra/overrides.yaml
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
[]
|
2024-09-23/07-06-14/train.log
ADDED
|
File without changes
|
2024-09-23/08-39-13/.hydra/config.yaml
ADDED
|
@@ -0,0 +1,74 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
experimental:
|
| 2 |
+
model:
|
| 3 |
+
core_model_type: pass_through
|
| 4 |
+
hidden_dim: 384
|
| 5 |
+
byte_hidden: 128
|
| 6 |
+
max_chunk_length: 12
|
| 7 |
+
max_num_chunks: 1024
|
| 8 |
+
num_delimiter_layers: 3
|
| 9 |
+
num_byte_decoder_layers: 5
|
| 10 |
+
target_chunk_len: 8.0
|
| 11 |
+
chunk_len_loss_weight: 0.1
|
| 12 |
+
chunk_len_penalty: 0.1
|
| 13 |
+
context_window: 8192
|
| 14 |
+
embedding_model_type: byte_level
|
| 15 |
+
tokenizer_type: bpe
|
| 16 |
+
tokenizer_dataset_name: simple_en_wiki
|
| 17 |
+
tokenizer_simplify_data: true
|
| 18 |
+
vocab_size: 259
|
| 19 |
+
lm_head_type: byte_level
|
| 20 |
+
lm_head_normalization: rms_norm
|
| 21 |
+
lm_head_bias: false
|
| 22 |
+
lm_head_dropout: 0.0
|
| 23 |
+
model_shell_type: byte_autoencoder_shell
|
| 24 |
+
embedding_weight_tying: true
|
| 25 |
+
ffn_weight_tying: false
|
| 26 |
+
cproj_weight_tying: false
|
| 27 |
+
positional_encoding_type: rope
|
| 28 |
+
trainer:
|
| 29 |
+
trainer_type: base_trainer
|
| 30 |
+
dataset: fineweb_edu_10B
|
| 31 |
+
batch_size: 6
|
| 32 |
+
gradient_accumulation_steps: 8
|
| 33 |
+
max_iters: 10000
|
| 34 |
+
eval_interval: 50000000
|
| 35 |
+
log_interval: 1
|
| 36 |
+
checkpoint_interval: 1000
|
| 37 |
+
eval_iters: 1000
|
| 38 |
+
run_eval: false
|
| 39 |
+
eval:
|
| 40 |
+
mcq_benchmarks: null
|
| 41 |
+
mcq_num_samples: 1000
|
| 42 |
+
eval_byte_metrics: false
|
| 43 |
+
text_modeling_eval: false
|
| 44 |
+
text_generation_eval: false
|
| 45 |
+
optimizer:
|
| 46 |
+
optimizer_name: adamW
|
| 47 |
+
lr: 0.0005
|
| 48 |
+
min_lr: 5.0e-05
|
| 49 |
+
weight_decay: 0.01
|
| 50 |
+
beta1: 0.9
|
| 51 |
+
beta2: 0.95
|
| 52 |
+
grad_clip: 1.0
|
| 53 |
+
lr_scheduler:
|
| 54 |
+
name: cosine
|
| 55 |
+
warmup_iters: 100
|
| 56 |
+
dataloader:
|
| 57 |
+
name: autoencoder
|
| 58 |
+
datasampling:
|
| 59 |
+
name: standard
|
| 60 |
+
loss_fn:
|
| 61 |
+
name: pass_through
|
| 62 |
+
general:
|
| 63 |
+
logging:
|
| 64 |
+
wandb_log: true
|
| 65 |
+
wandb_project: SuperTinyLanguageModels
|
| 66 |
+
wandb_run_name: null
|
| 67 |
+
group_name: experimental_byte_level
|
| 68 |
+
paths:
|
| 69 |
+
output_dir: outputs
|
| 70 |
+
data_dir: data
|
| 71 |
+
checkpoint_dir: checkpoints
|
| 72 |
+
eval_dir: evals
|
| 73 |
+
seed: 489
|
| 74 |
+
device: cuda
|
2024-09-23/08-39-13/.hydra/hydra.yaml
ADDED
|
@@ -0,0 +1,154 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
hydra:
|
| 2 |
+
run:
|
| 3 |
+
dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S}
|
| 4 |
+
sweep:
|
| 5 |
+
dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S}
|
| 6 |
+
subdir: ${hydra.job.num}
|
| 7 |
+
launcher:
|
| 8 |
+
_target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher
|
| 9 |
+
sweeper:
|
| 10 |
+
_target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper
|
| 11 |
+
max_batch_size: null
|
| 12 |
+
params: null
|
| 13 |
+
help:
|
| 14 |
+
app_name: ${hydra.job.name}
|
| 15 |
+
header: '${hydra.help.app_name} is powered by Hydra.
|
| 16 |
+
|
| 17 |
+
'
|
| 18 |
+
footer: 'Powered by Hydra (https://hydra.cc)
|
| 19 |
+
|
| 20 |
+
Use --hydra-help to view Hydra specific help
|
| 21 |
+
|
| 22 |
+
'
|
| 23 |
+
template: '${hydra.help.header}
|
| 24 |
+
|
| 25 |
+
== Configuration groups ==
|
| 26 |
+
|
| 27 |
+
Compose your configuration from those groups (group=option)
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
$APP_CONFIG_GROUPS
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
== Config ==
|
| 34 |
+
|
| 35 |
+
Override anything in the config (foo.bar=value)
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
$CONFIG
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
${hydra.help.footer}
|
| 42 |
+
|
| 43 |
+
'
|
| 44 |
+
hydra_help:
|
| 45 |
+
template: 'Hydra (${hydra.runtime.version})
|
| 46 |
+
|
| 47 |
+
See https://hydra.cc for more info.
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
== Flags ==
|
| 51 |
+
|
| 52 |
+
$FLAGS_HELP
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
== Configuration groups ==
|
| 56 |
+
|
| 57 |
+
Compose your configuration from those groups (For example, append hydra/job_logging=disabled
|
| 58 |
+
to command line)
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
$HYDRA_CONFIG_GROUPS
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
Use ''--cfg hydra'' to Show the Hydra config.
|
| 65 |
+
|
| 66 |
+
'
|
| 67 |
+
hydra_help: ???
|
| 68 |
+
hydra_logging:
|
| 69 |
+
version: 1
|
| 70 |
+
formatters:
|
| 71 |
+
simple:
|
| 72 |
+
format: '[%(asctime)s][HYDRA] %(message)s'
|
| 73 |
+
handlers:
|
| 74 |
+
console:
|
| 75 |
+
class: logging.StreamHandler
|
| 76 |
+
formatter: simple
|
| 77 |
+
stream: ext://sys.stdout
|
| 78 |
+
root:
|
| 79 |
+
level: INFO
|
| 80 |
+
handlers:
|
| 81 |
+
- console
|
| 82 |
+
loggers:
|
| 83 |
+
logging_example:
|
| 84 |
+
level: DEBUG
|
| 85 |
+
disable_existing_loggers: false
|
| 86 |
+
job_logging:
|
| 87 |
+
version: 1
|
| 88 |
+
formatters:
|
| 89 |
+
simple:
|
| 90 |
+
format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s'
|
| 91 |
+
handlers:
|
| 92 |
+
console:
|
| 93 |
+
class: logging.StreamHandler
|
| 94 |
+
formatter: simple
|
| 95 |
+
stream: ext://sys.stdout
|
| 96 |
+
file:
|
| 97 |
+
class: logging.FileHandler
|
| 98 |
+
formatter: simple
|
| 99 |
+
filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log
|
| 100 |
+
root:
|
| 101 |
+
level: INFO
|
| 102 |
+
handlers:
|
| 103 |
+
- console
|
| 104 |
+
- file
|
| 105 |
+
disable_existing_loggers: false
|
| 106 |
+
env: {}
|
| 107 |
+
mode: RUN
|
| 108 |
+
searchpath: []
|
| 109 |
+
callbacks: {}
|
| 110 |
+
output_subdir: .hydra
|
| 111 |
+
overrides:
|
| 112 |
+
hydra:
|
| 113 |
+
- hydra.mode=RUN
|
| 114 |
+
task: []
|
| 115 |
+
job:
|
| 116 |
+
name: train
|
| 117 |
+
chdir: null
|
| 118 |
+
override_dirname: ''
|
| 119 |
+
id: ???
|
| 120 |
+
num: ???
|
| 121 |
+
config_name: experimental/byte_autoencoder_1
|
| 122 |
+
env_set: {}
|
| 123 |
+
env_copy: []
|
| 124 |
+
config:
|
| 125 |
+
override_dirname:
|
| 126 |
+
kv_sep: '='
|
| 127 |
+
item_sep: ','
|
| 128 |
+
exclude_keys: []
|
| 129 |
+
runtime:
|
| 130 |
+
version: 1.3.2
|
| 131 |
+
version_base: '1.1'
|
| 132 |
+
cwd: /root/SuperTinyLanguageModels
|
| 133 |
+
config_sources:
|
| 134 |
+
- path: hydra.conf
|
| 135 |
+
schema: pkg
|
| 136 |
+
provider: hydra
|
| 137 |
+
- path: /root/SuperTinyLanguageModels/configs/train
|
| 138 |
+
schema: file
|
| 139 |
+
provider: main
|
| 140 |
+
- path: ''
|
| 141 |
+
schema: structured
|
| 142 |
+
provider: schema
|
| 143 |
+
output_dir: /root/SuperTinyLanguageModels/outputs/2024-09-23/08-39-13
|
| 144 |
+
choices:
|
| 145 |
+
hydra/env: default
|
| 146 |
+
hydra/callbacks: null
|
| 147 |
+
hydra/job_logging: default
|
| 148 |
+
hydra/hydra_logging: default
|
| 149 |
+
hydra/hydra_help: default
|
| 150 |
+
hydra/help: default
|
| 151 |
+
hydra/sweeper: basic
|
| 152 |
+
hydra/launcher: basic
|
| 153 |
+
hydra/output: default
|
| 154 |
+
verbose: false
|
2024-09-23/08-39-13/.hydra/overrides.yaml
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
[]
|
2024-09-23/08-39-13/train.log
ADDED
|
File without changes
|
2024-09-23/08-40-08/.hydra/config.yaml
ADDED
|
@@ -0,0 +1,74 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
experimental:
|
| 2 |
+
model:
|
| 3 |
+
core_model_type: pass_through
|
| 4 |
+
hidden_dim: 384
|
| 5 |
+
byte_hidden: 128
|
| 6 |
+
max_chunk_length: 12
|
| 7 |
+
max_num_chunks: 1024
|
| 8 |
+
num_delimiter_layers: 3
|
| 9 |
+
num_byte_decoder_layers: 5
|
| 10 |
+
target_chunk_len: 8.0
|
| 11 |
+
chunk_len_loss_weight: 0.1
|
| 12 |
+
chunk_len_penalty: 0.1
|
| 13 |
+
context_window: 8192
|
| 14 |
+
embedding_model_type: byte_level
|
| 15 |
+
tokenizer_type: bpe
|
| 16 |
+
tokenizer_dataset_name: simple_en_wiki
|
| 17 |
+
tokenizer_simplify_data: true
|
| 18 |
+
vocab_size: 259
|
| 19 |
+
lm_head_type: byte_level
|
| 20 |
+
lm_head_normalization: rms_norm
|
| 21 |
+
lm_head_bias: false
|
| 22 |
+
lm_head_dropout: 0.0
|
| 23 |
+
model_shell_type: byte_autoencoder_shell
|
| 24 |
+
embedding_weight_tying: true
|
| 25 |
+
ffn_weight_tying: false
|
| 26 |
+
cproj_weight_tying: false
|
| 27 |
+
positional_encoding_type: rope
|
| 28 |
+
trainer:
|
| 29 |
+
trainer_type: base_trainer
|
| 30 |
+
dataset: fineweb_edu_10B
|
| 31 |
+
batch_size: 6
|
| 32 |
+
gradient_accumulation_steps: 8
|
| 33 |
+
max_iters: 10000
|
| 34 |
+
eval_interval: 50000000
|
| 35 |
+
log_interval: 1
|
| 36 |
+
checkpoint_interval: 1000
|
| 37 |
+
eval_iters: 1000
|
| 38 |
+
run_eval: false
|
| 39 |
+
eval:
|
| 40 |
+
mcq_benchmarks: null
|
| 41 |
+
mcq_num_samples: 1000
|
| 42 |
+
eval_byte_metrics: false
|
| 43 |
+
text_modeling_eval: false
|
| 44 |
+
text_generation_eval: false
|
| 45 |
+
optimizer:
|
| 46 |
+
optimizer_name: adamW
|
| 47 |
+
lr: 0.0005
|
| 48 |
+
min_lr: 5.0e-05
|
| 49 |
+
weight_decay: 0.01
|
| 50 |
+
beta1: 0.9
|
| 51 |
+
beta2: 0.95
|
| 52 |
+
grad_clip: 1.0
|
| 53 |
+
lr_scheduler:
|
| 54 |
+
name: cosine
|
| 55 |
+
warmup_iters: 100
|
| 56 |
+
dataloader:
|
| 57 |
+
name: autoencoder
|
| 58 |
+
datasampling:
|
| 59 |
+
name: standard
|
| 60 |
+
loss_fn:
|
| 61 |
+
name: pass_through
|
| 62 |
+
general:
|
| 63 |
+
logging:
|
| 64 |
+
wandb_log: true
|
| 65 |
+
wandb_project: SuperTinyLanguageModels
|
| 66 |
+
wandb_run_name: null
|
| 67 |
+
group_name: experimental_byte_level
|
| 68 |
+
paths:
|
| 69 |
+
output_dir: outputs
|
| 70 |
+
data_dir: data
|
| 71 |
+
checkpoint_dir: checkpoints
|
| 72 |
+
eval_dir: evals
|
| 73 |
+
seed: 489
|
| 74 |
+
device: cuda
|
2024-09-23/08-40-08/.hydra/hydra.yaml
ADDED
|
@@ -0,0 +1,154 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
hydra:
|
| 2 |
+
run:
|
| 3 |
+
dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S}
|
| 4 |
+
sweep:
|
| 5 |
+
dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S}
|
| 6 |
+
subdir: ${hydra.job.num}
|
| 7 |
+
launcher:
|
| 8 |
+
_target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher
|
| 9 |
+
sweeper:
|
| 10 |
+
_target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper
|
| 11 |
+
max_batch_size: null
|
| 12 |
+
params: null
|
| 13 |
+
help:
|
| 14 |
+
app_name: ${hydra.job.name}
|
| 15 |
+
header: '${hydra.help.app_name} is powered by Hydra.
|
| 16 |
+
|
| 17 |
+
'
|
| 18 |
+
footer: 'Powered by Hydra (https://hydra.cc)
|
| 19 |
+
|
| 20 |
+
Use --hydra-help to view Hydra specific help
|
| 21 |
+
|
| 22 |
+
'
|
| 23 |
+
template: '${hydra.help.header}
|
| 24 |
+
|
| 25 |
+
== Configuration groups ==
|
| 26 |
+
|
| 27 |
+
Compose your configuration from those groups (group=option)
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
$APP_CONFIG_GROUPS
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
== Config ==
|
| 34 |
+
|
| 35 |
+
Override anything in the config (foo.bar=value)
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
$CONFIG
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
${hydra.help.footer}
|
| 42 |
+
|
| 43 |
+
'
|
| 44 |
+
hydra_help:
|
| 45 |
+
template: 'Hydra (${hydra.runtime.version})
|
| 46 |
+
|
| 47 |
+
See https://hydra.cc for more info.
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
== Flags ==
|
| 51 |
+
|
| 52 |
+
$FLAGS_HELP
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
== Configuration groups ==
|
| 56 |
+
|
| 57 |
+
Compose your configuration from those groups (For example, append hydra/job_logging=disabled
|
| 58 |
+
to command line)
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
$HYDRA_CONFIG_GROUPS
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
Use ''--cfg hydra'' to Show the Hydra config.
|
| 65 |
+
|
| 66 |
+
'
|
| 67 |
+
hydra_help: ???
|
| 68 |
+
hydra_logging:
|
| 69 |
+
version: 1
|
| 70 |
+
formatters:
|
| 71 |
+
simple:
|
| 72 |
+
format: '[%(asctime)s][HYDRA] %(message)s'
|
| 73 |
+
handlers:
|
| 74 |
+
console:
|
| 75 |
+
class: logging.StreamHandler
|
| 76 |
+
formatter: simple
|
| 77 |
+
stream: ext://sys.stdout
|
| 78 |
+
root:
|
| 79 |
+
level: INFO
|
| 80 |
+
handlers:
|
| 81 |
+
- console
|
| 82 |
+
loggers:
|
| 83 |
+
logging_example:
|
| 84 |
+
level: DEBUG
|
| 85 |
+
disable_existing_loggers: false
|
| 86 |
+
job_logging:
|
| 87 |
+
version: 1
|
| 88 |
+
formatters:
|
| 89 |
+
simple:
|
| 90 |
+
format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s'
|
| 91 |
+
handlers:
|
| 92 |
+
console:
|
| 93 |
+
class: logging.StreamHandler
|
| 94 |
+
formatter: simple
|
| 95 |
+
stream: ext://sys.stdout
|
| 96 |
+
file:
|
| 97 |
+
class: logging.FileHandler
|
| 98 |
+
formatter: simple
|
| 99 |
+
filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log
|
| 100 |
+
root:
|
| 101 |
+
level: INFO
|
| 102 |
+
handlers:
|
| 103 |
+
- console
|
| 104 |
+
- file
|
| 105 |
+
disable_existing_loggers: false
|
| 106 |
+
env: {}
|
| 107 |
+
mode: RUN
|
| 108 |
+
searchpath: []
|
| 109 |
+
callbacks: {}
|
| 110 |
+
output_subdir: .hydra
|
| 111 |
+
overrides:
|
| 112 |
+
hydra:
|
| 113 |
+
- hydra.mode=RUN
|
| 114 |
+
task: []
|
| 115 |
+
job:
|
| 116 |
+
name: train
|
| 117 |
+
chdir: null
|
| 118 |
+
override_dirname: ''
|
| 119 |
+
id: ???
|
| 120 |
+
num: ???
|
| 121 |
+
config_name: experimental/byte_autoencoder_1
|
| 122 |
+
env_set: {}
|
| 123 |
+
env_copy: []
|
| 124 |
+
config:
|
| 125 |
+
override_dirname:
|
| 126 |
+
kv_sep: '='
|
| 127 |
+
item_sep: ','
|
| 128 |
+
exclude_keys: []
|
| 129 |
+
runtime:
|
| 130 |
+
version: 1.3.2
|
| 131 |
+
version_base: '1.1'
|
| 132 |
+
cwd: /root/SuperTinyLanguageModels
|
| 133 |
+
config_sources:
|
| 134 |
+
- path: hydra.conf
|
| 135 |
+
schema: pkg
|
| 136 |
+
provider: hydra
|
| 137 |
+
- path: /root/SuperTinyLanguageModels/configs/train
|
| 138 |
+
schema: file
|
| 139 |
+
provider: main
|
| 140 |
+
- path: ''
|
| 141 |
+
schema: structured
|
| 142 |
+
provider: schema
|
| 143 |
+
output_dir: /root/SuperTinyLanguageModels/outputs/2024-09-23/08-40-08
|
| 144 |
+
choices:
|
| 145 |
+
hydra/env: default
|
| 146 |
+
hydra/callbacks: null
|
| 147 |
+
hydra/job_logging: default
|
| 148 |
+
hydra/hydra_logging: default
|
| 149 |
+
hydra/hydra_help: default
|
| 150 |
+
hydra/help: default
|
| 151 |
+
hydra/sweeper: basic
|
| 152 |
+
hydra/launcher: basic
|
| 153 |
+
hydra/output: default
|
| 154 |
+
verbose: false
|
2024-09-23/08-40-08/.hydra/overrides.yaml
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
[]
|
2024-09-23/08-40-08/train.log
ADDED
|
File without changes
|
2024-09-23/08-40-08/wandb/debug-internal.log
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"time":"2024-09-23T09:14:22.59580271Z","level":"INFO","msg":"using version","core version":"0.18.1"}
|
| 2 |
+
{"time":"2024-09-23T09:14:22.59581747Z","level":"INFO","msg":"created symlink","path":"/root/SuperTinyLanguageModels/outputs/2024-09-23/08-40-08/wandb/run-20240923_091422-a2kxhd8v/logs/debug-core.log"}
|
| 3 |
+
{"time":"2024-09-23T09:14:22.595881422Z","level":"INFO","msg":"using version","core version":"0.18.1"}
|
| 4 |
+
{"time":"2024-09-23T09:14:22.595887882Z","level":"INFO","msg":"created symlink","path":"/root/SuperTinyLanguageModels/outputs/2024-09-23/08-40-08/wandb/run-20240923_091422-a2kxhd8v/logs/debug-core.log"}
|
| 5 |
+
{"time":"2024-09-23T09:14:22.59917443Z","level":"INFO","msg":"created new stream","id":"a2kxhd8v"}
|
| 6 |
+
{"time":"2024-09-23T09:14:22.59919309Z","level":"INFO","msg":"stream: started","id":"a2kxhd8v"}
|
| 7 |
+
{"time":"2024-09-23T09:14:22.59921417Z","level":"INFO","msg":"sender: started","stream_id":{"value":"a2kxhd8v"}}
|
| 8 |
+
{"time":"2024-09-23T09:14:22.599226691Z","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"a2kxhd8v"}}
|
| 9 |
+
{"time":"2024-09-23T09:14:22.599236461Z","level":"INFO","msg":"handler: started","stream_id":{"value":"a2kxhd8v"}}
|
| 10 |
+
{"time":"2024-09-23T09:14:22.982350736Z","level":"INFO","msg":"wandb-core","!BADKEY":null}
|
| 11 |
+
{"time":"2024-09-23T09:14:22.985015444Z","level":"INFO","msg":"Starting system monitor"}
|
| 12 |
+
{"time":"2024-09-23T09:14:27.10372121Z","level":"INFO","msg":"stream: closing","id":"a2kxhd8v"}
|
| 13 |
+
{"time":"2024-09-23T09:14:27.103806442Z","level":"INFO","msg":"Stopping system monitor"}
|
| 14 |
+
{"time":"2024-09-23T09:14:27.104964992Z","level":"INFO","msg":"Stopped system monitor"}
|
2024-09-23/08-40-08/wandb/debug.log
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
2024-09-23 09:14:22,582 INFO MainThread:78108 [wandb_setup.py:_flush():77] Current SDK version is 0.18.1
|
| 2 |
+
2024-09-23 09:14:22,582 INFO MainThread:78108 [wandb_setup.py:_flush():77] Configure stats pid to 78108
|
| 3 |
+
2024-09-23 09:14:22,582 INFO MainThread:78108 [wandb_setup.py:_flush():77] Loading settings from /root/.config/wandb/settings
|
| 4 |
+
2024-09-23 09:14:22,583 INFO MainThread:78108 [wandb_setup.py:_flush():77] Loading settings from /root/SuperTinyLanguageModels/outputs/2024-09-23/08-40-08/wandb/settings
|
| 5 |
+
2024-09-23 09:14:22,583 INFO MainThread:78108 [wandb_setup.py:_flush():77] Loading settings from environment variables: {}
|
| 6 |
+
2024-09-23 09:14:22,583 INFO MainThread:78108 [wandb_setup.py:_flush():77] Applying setup settings: {'mode': None, '_disable_service': None}
|
| 7 |
+
2024-09-23 09:14:22,583 INFO MainThread:78108 [wandb_setup.py:_flush():77] Inferring run settings from compute environment: {'program_relpath': 'train.py', 'program_abspath': '/root/SuperTinyLanguageModels/train.py', 'program': '/root/SuperTinyLanguageModels/train.py'}
|
| 8 |
+
2024-09-23 09:14:22,583 INFO MainThread:78108 [wandb_setup.py:_flush():77] Applying login settings: {}
|
| 9 |
+
2024-09-23 09:14:22,583 INFO MainThread:78108 [wandb_init.py:_log_setup():532] Logging user logs to /root/SuperTinyLanguageModels/outputs/2024-09-23/08-40-08/wandb/run-20240923_091422-a2kxhd8v/logs/debug.log
|
| 10 |
+
2024-09-23 09:14:22,583 INFO MainThread:78108 [wandb_init.py:_log_setup():533] Logging internal logs to /root/SuperTinyLanguageModels/outputs/2024-09-23/08-40-08/wandb/run-20240923_091422-a2kxhd8v/logs/debug-internal.log
|
| 11 |
+
2024-09-23 09:14:22,583 INFO MainThread:78108 [wandb_init.py:init():616] calling init triggers
|
| 12 |
+
2024-09-23 09:14:22,583 INFO MainThread:78108 [wandb_init.py:init():623] wandb.init called with sweep_config: {}
|
| 13 |
+
config: {'model': {'core_model_type': 'pass_through', 'hidden_dim': 384, 'byte_hidden': 128, 'max_chunk_length': 12, 'max_num_chunks': 1024, 'num_delimiter_layers': 3, 'num_byte_decoder_layers': 5, 'target_chunk_len': 8.0, 'chunk_len_loss_weight': 0.1, 'chunk_len_penalty': 0.1, 'context_window': 8192, 'embedding_model_type': 'byte_level', 'tokenizer_type': 'bpe', 'tokenizer_dataset_name': 'simple_en_wiki', 'tokenizer_simplify_data': True, 'vocab_size': 259, 'lm_head_type': 'byte_level', 'lm_head_normalization': 'rms_norm', 'lm_head_bias': False, 'lm_head_dropout': 0.0, 'model_shell_type': 'byte_autoencoder_shell', 'embedding_weight_tying': True, 'ffn_weight_tying': False, 'cproj_weight_tying': False, 'positional_encoding_type': 'rope'}, 'trainer': {'trainer_type': 'base_trainer', 'dataset': 'fineweb_edu_10B', 'batch_size': 6, 'gradient_accumulation_steps': 8, 'max_iters': 10000, 'eval_interval': 50000000, 'log_interval': 1, 'checkpoint_interval': 1000, 'eval_iters': 1000, 'run_eval': False, 'eval': {'mcq_benchmarks': None, 'mcq_num_samples': 1000, 'eval_byte_metrics': False, 'text_modeling_eval': False, 'text_generation_eval': False}, 'optimizer': {'optimizer_name': 'adamW', 'lr': 0.0005, 'min_lr': 5e-05, 'weight_decay': 0.01, 'beta1': 0.9, 'beta2': 0.95, 'grad_clip': 1.0}, 'lr_scheduler': {'name': 'cosine', 'warmup_iters': 100}, 'dataloader': {'name': 'autoencoder'}, 'datasampling': {'name': 'standard'}, 'loss_fn': {'name': 'pass_through'}}, 'general': {'logging': {'wandb_log': True, 'wandb_project': 'SuperTinyLanguageModels', 'wandb_run_name': None, 'group_name': 'experimental_byte_level'}, 'paths': {'output_dir': 'outputs', 'data_dir': '/root/SuperTinyLanguageModels/data', 'checkpoint_dir': 'checkpoints', 'eval_dir': '/root/SuperTinyLanguageModels/evals'}, 'seed': 489, 'device': 'cuda'}}
|
| 14 |
+
2024-09-23 09:14:22,583 INFO MainThread:78108 [wandb_init.py:init():666] starting backend
|
| 15 |
+
2024-09-23 09:14:22,583 INFO MainThread:78108 [wandb_init.py:init():670] setting up manager
|
| 16 |
+
2024-09-23 09:14:22,584 INFO MainThread:78108 [backend.py:_multiprocessing_setup():105] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
|
| 17 |
+
2024-09-23 09:14:22,586 INFO MainThread:78108 [wandb_init.py:init():678] backend started and connected
|
| 18 |
+
2024-09-23 09:14:22,588 INFO MainThread:78108 [wandb_init.py:init():773] updated telemetry
|
| 19 |
+
2024-09-23 09:14:22,598 INFO MainThread:78108 [wandb_init.py:init():806] communicating run to backend with 90.0 second timeout
|
| 20 |
+
2024-09-23 09:14:22,974 INFO MainThread:78108 [wandb_init.py:init():857] starting run threads in backend
|
| 21 |
+
2024-09-23 09:14:23,128 INFO MainThread:78108 [wandb_run.py:_console_start():2459] atexit reg
|
| 22 |
+
2024-09-23 09:14:23,128 INFO MainThread:78108 [wandb_run.py:_redirect():2307] redirect: wrap_raw
|
| 23 |
+
2024-09-23 09:14:23,129 INFO MainThread:78108 [wandb_run.py:_redirect():2372] Wrapping output streams.
|
| 24 |
+
2024-09-23 09:14:23,129 INFO MainThread:78108 [wandb_run.py:_redirect():2397] Redirects installed.
|
| 25 |
+
2024-09-23 09:14:23,135 INFO MainThread:78108 [wandb_init.py:init():900] run started, returning control to user process
|
| 26 |
+
2024-09-23 09:14:27,104 WARNING MsgRouterThr:78108 [router.py:message_loop():77] message_loop has been closed
|
2024-09-23/08-40-08/wandb/run-20240923_091422-a2kxhd8v/files/config.yaml
ADDED
|
@@ -0,0 +1,114 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
_wandb:
|
| 2 |
+
value:
|
| 3 |
+
cli_version: 0.18.1
|
| 4 |
+
m: []
|
| 5 |
+
python_version: 3.10.14
|
| 6 |
+
t:
|
| 7 |
+
"1":
|
| 8 |
+
- 1
|
| 9 |
+
- 5
|
| 10 |
+
- 11
|
| 11 |
+
- 49
|
| 12 |
+
- 50
|
| 13 |
+
- 51
|
| 14 |
+
- 53
|
| 15 |
+
- 55
|
| 16 |
+
"2":
|
| 17 |
+
- 1
|
| 18 |
+
- 5
|
| 19 |
+
- 11
|
| 20 |
+
- 49
|
| 21 |
+
- 50
|
| 22 |
+
- 51
|
| 23 |
+
- 53
|
| 24 |
+
- 55
|
| 25 |
+
"3":
|
| 26 |
+
- 13
|
| 27 |
+
- 15
|
| 28 |
+
- 16
|
| 29 |
+
- 23
|
| 30 |
+
- 55
|
| 31 |
+
"4": 3.10.14
|
| 32 |
+
"5": 0.18.1
|
| 33 |
+
"6": 4.44.2
|
| 34 |
+
"8":
|
| 35 |
+
- 5
|
| 36 |
+
- 9
|
| 37 |
+
"12": 0.18.1
|
| 38 |
+
"13": linux-x86_64
|
| 39 |
+
general:
|
| 40 |
+
value:
|
| 41 |
+
device: cuda
|
| 42 |
+
logging:
|
| 43 |
+
group_name: experimental_byte_level
|
| 44 |
+
wandb_log: true
|
| 45 |
+
wandb_project: SuperTinyLanguageModels
|
| 46 |
+
wandb_run_name: null
|
| 47 |
+
paths:
|
| 48 |
+
checkpoint_dir: checkpoints
|
| 49 |
+
data_dir: /root/SuperTinyLanguageModels/data
|
| 50 |
+
eval_dir: /root/SuperTinyLanguageModels/evals
|
| 51 |
+
output_dir: outputs
|
| 52 |
+
seed: 489
|
| 53 |
+
model:
|
| 54 |
+
value:
|
| 55 |
+
byte_hidden: 128
|
| 56 |
+
chunk_len_loss_weight: 0.1
|
| 57 |
+
chunk_len_penalty: 0.1
|
| 58 |
+
context_window: 8192
|
| 59 |
+
core_model_type: pass_through
|
| 60 |
+
cproj_weight_tying: false
|
| 61 |
+
embedding_model_type: byte_level
|
| 62 |
+
embedding_weight_tying: true
|
| 63 |
+
ffn_weight_tying: false
|
| 64 |
+
hidden_dim: 384
|
| 65 |
+
lm_head_bias: false
|
| 66 |
+
lm_head_dropout: 0
|
| 67 |
+
lm_head_normalization: rms_norm
|
| 68 |
+
lm_head_type: byte_level
|
| 69 |
+
max_chunk_length: 12
|
| 70 |
+
max_num_chunks: 1024
|
| 71 |
+
model_shell_type: byte_autoencoder_shell
|
| 72 |
+
num_byte_decoder_layers: 5
|
| 73 |
+
num_delimiter_layers: 3
|
| 74 |
+
positional_encoding_type: rope
|
| 75 |
+
target_chunk_len: 8
|
| 76 |
+
tokenizer_dataset_name: simple_en_wiki
|
| 77 |
+
tokenizer_simplify_data: true
|
| 78 |
+
tokenizer_type: bpe
|
| 79 |
+
vocab_size: 259
|
| 80 |
+
trainer:
|
| 81 |
+
value:
|
| 82 |
+
batch_size: 6
|
| 83 |
+
checkpoint_interval: 1000
|
| 84 |
+
dataloader:
|
| 85 |
+
name: autoencoder
|
| 86 |
+
datasampling:
|
| 87 |
+
name: standard
|
| 88 |
+
dataset: fineweb_edu_10B
|
| 89 |
+
eval:
|
| 90 |
+
eval_byte_metrics: false
|
| 91 |
+
mcq_benchmarks: null
|
| 92 |
+
mcq_num_samples: 1000
|
| 93 |
+
text_generation_eval: false
|
| 94 |
+
text_modeling_eval: false
|
| 95 |
+
eval_interval: 50000000
|
| 96 |
+
eval_iters: 1000
|
| 97 |
+
gradient_accumulation_steps: 8
|
| 98 |
+
log_interval: 1
|
| 99 |
+
loss_fn:
|
| 100 |
+
name: pass_through
|
| 101 |
+
lr_scheduler:
|
| 102 |
+
name: cosine
|
| 103 |
+
warmup_iters: 100
|
| 104 |
+
max_iters: 10000
|
| 105 |
+
optimizer:
|
| 106 |
+
beta1: 0.9
|
| 107 |
+
beta2: 0.95
|
| 108 |
+
grad_clip: 1
|
| 109 |
+
lr: 0.0005
|
| 110 |
+
min_lr: 5e-05
|
| 111 |
+
optimizer_name: adamW
|
| 112 |
+
weight_decay: 0.01
|
| 113 |
+
run_eval: false
|
| 114 |
+
trainer_type: base_trainer
|
2024-09-23/08-40-08/wandb/run-20240923_091422-a2kxhd8v/files/output.log
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Weight and Biases Initialized
|
| 2 |
+
Rank0 Trainer built
|
| 3 |
+
Training loop is starting
|
2024-09-23/08-40-08/wandb/run-20240923_091422-a2kxhd8v/files/requirements.txt
ADDED
|
@@ -0,0 +1,121 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
setuptools==75.1.0
|
| 2 |
+
wheel==0.44.0
|
| 3 |
+
pip==24.2
|
| 4 |
+
wcwidth==0.2.13
|
| 5 |
+
sentencepiece==0.2.0
|
| 6 |
+
pytz==2024.2
|
| 7 |
+
mpmath==1.3.0
|
| 8 |
+
distlib==0.3.8
|
| 9 |
+
antlr4-python3-runtime==4.9.3
|
| 10 |
+
xxhash==3.5.0
|
| 11 |
+
urllib3==2.2.3
|
| 12 |
+
tzdata==2024.1
|
| 13 |
+
typing_extensions==4.12.2
|
| 14 |
+
tqdm==4.66.5
|
| 15 |
+
threadpoolctl==3.5.0
|
| 16 |
+
sympy==1.13.3
|
| 17 |
+
smmap==5.0.1
|
| 18 |
+
six==1.16.0
|
| 19 |
+
setproctitle==1.3.3
|
| 20 |
+
safetensors==0.4.5
|
| 21 |
+
regex==2024.9.11
|
| 22 |
+
rapidfuzz==3.9.7
|
| 23 |
+
PyYAML==6.0.2
|
| 24 |
+
pytrec-eval-terrier==0.5.6
|
| 25 |
+
pyphen==0.16.0
|
| 26 |
+
Pygments==2.18.0
|
| 27 |
+
psutil==6.0.0
|
| 28 |
+
protobuf==5.28.2
|
| 29 |
+
prettytable==3.11.0
|
| 30 |
+
polars==1.7.1
|
| 31 |
+
platformdirs==4.3.6
|
| 32 |
+
pillow==10.4.0
|
| 33 |
+
packaging==24.1
|
| 34 |
+
nvidia-nvtx-cu12==12.1.105
|
| 35 |
+
nvidia-nvjitlink-cu12==12.6.68
|
| 36 |
+
nvidia-nccl-cu12==2.20.5
|
| 37 |
+
nvidia-curand-cu12==10.3.2.106
|
| 38 |
+
nvidia-cufft-cu12==11.0.2.54
|
| 39 |
+
nvidia-cuda-runtime-cu12==12.1.105
|
| 40 |
+
nvidia-cuda-nvrtc-cu12==12.1.105
|
| 41 |
+
nvidia-cuda-cupti-cu12==12.1.105
|
| 42 |
+
nvidia-cublas-cu12==12.1.3.1
|
| 43 |
+
numpy==1.26.4
|
| 44 |
+
nodeenv==1.9.1
|
| 45 |
+
networkx==3.3
|
| 46 |
+
mdurl==0.1.2
|
| 47 |
+
MarkupSafe==2.1.5
|
| 48 |
+
joblib==1.4.2
|
| 49 |
+
idna==3.10
|
| 50 |
+
identify==2.6.1
|
| 51 |
+
fsspec==2024.6.1
|
| 52 |
+
frozenlist==1.4.1
|
| 53 |
+
filelock==3.16.1
|
| 54 |
+
eval_type_backport==0.2.0
|
| 55 |
+
dill==0.3.8
|
| 56 |
+
click==8.1.7
|
| 57 |
+
charset-normalizer==3.3.2
|
| 58 |
+
cfgv==3.4.0
|
| 59 |
+
certifi==2024.8.30
|
| 60 |
+
attrs==24.2.0
|
| 61 |
+
async-timeout==4.0.3
|
| 62 |
+
annotated-types==0.7.0
|
| 63 |
+
aiohappyeyeballs==2.4.0
|
| 64 |
+
virtualenv==20.26.5
|
| 65 |
+
triton==3.0.0
|
| 66 |
+
textstat==0.7.4
|
| 67 |
+
sentry-sdk==2.14.0
|
| 68 |
+
scipy==1.14.1
|
| 69 |
+
requests==2.32.3
|
| 70 |
+
python-dateutil==2.9.0.post0
|
| 71 |
+
pydantic_core==2.23.4
|
| 72 |
+
pyarrow==17.0.0
|
| 73 |
+
omegaconf==2.3.0
|
| 74 |
+
nvidia-cusparse-cu12==12.1.0.106
|
| 75 |
+
nvidia-cudnn-cu12==9.1.0.70
|
| 76 |
+
nltk==3.9.1
|
| 77 |
+
multiprocess==0.70.16
|
| 78 |
+
multidict==6.1.0
|
| 79 |
+
markdown-it-py==3.0.0
|
| 80 |
+
Levenshtein==0.26.0
|
| 81 |
+
Jinja2==3.1.4
|
| 82 |
+
gitdb==4.0.11
|
| 83 |
+
docker-pycreds==0.4.0
|
| 84 |
+
aiosignal==1.3.1
|
| 85 |
+
yarl==1.11.1
|
| 86 |
+
tiktoken==0.7.0
|
| 87 |
+
scikit-learn==1.5.2
|
| 88 |
+
rich==13.8.1
|
| 89 |
+
pydantic==2.9.2
|
| 90 |
+
pre-commit==3.8.0
|
| 91 |
+
pandas==2.2.3
|
| 92 |
+
nvidia-cusolver-cu12==11.4.5.107
|
| 93 |
+
language_tool_python==2.8.1
|
| 94 |
+
hydra-core==1.3.2
|
| 95 |
+
huggingface-hub==0.25.0
|
| 96 |
+
GitPython==3.1.43
|
| 97 |
+
wandb==0.18.1
|
| 98 |
+
torch==2.4.1
|
| 99 |
+
tokenizers==0.19.1
|
| 100 |
+
aiohttp==3.10.5
|
| 101 |
+
transformers==4.44.2
|
| 102 |
+
sentence-transformers==3.1.1
|
| 103 |
+
datasets==3.0.0
|
| 104 |
+
mteb==1.14.21
|
| 105 |
+
autocommand==2.2.2
|
| 106 |
+
backports.tarfile==1.2.0
|
| 107 |
+
importlib_metadata==8.0.0
|
| 108 |
+
importlib_resources==6.4.0
|
| 109 |
+
inflect==7.3.1
|
| 110 |
+
jaraco.collections==5.1.0
|
| 111 |
+
jaraco.context==5.3.0
|
| 112 |
+
jaraco.functools==4.0.1
|
| 113 |
+
jaraco.text==3.12.1
|
| 114 |
+
more-itertools==10.3.0
|
| 115 |
+
packaging==24.1
|
| 116 |
+
platformdirs==4.2.2
|
| 117 |
+
tomli==2.0.1
|
| 118 |
+
typeguard==4.3.0
|
| 119 |
+
typing_extensions==4.12.2
|
| 120 |
+
wheel==0.43.0
|
| 121 |
+
zipp==3.19.2
|
2024-09-23/08-40-08/wandb/run-20240923_091422-a2kxhd8v/files/wandb-metadata.json
ADDED
|
@@ -0,0 +1,88 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"os": "Linux-5.15.0-117-generic-x86_64-with-glibc2.31",
|
| 3 |
+
"python": "3.10.14",
|
| 4 |
+
"startedAt": "2024-09-23T09:14:22.586171Z",
|
| 5 |
+
"args": [
|
| 6 |
+
"--config-name",
|
| 7 |
+
"experimental/byte_autoencoder_1"
|
| 8 |
+
],
|
| 9 |
+
"program": "/root/SuperTinyLanguageModels/train.py",
|
| 10 |
+
"codePath": "train.py",
|
| 11 |
+
"git": {
|
| 12 |
+
"remote": "https://github.com/LeonGuertler/SuperTinyLanguageModels.git",
|
| 13 |
+
"commit": "ebdf9039e89c5d337997d0c2b11bf4e992886243"
|
| 14 |
+
},
|
| 15 |
+
"email": "calvin14@gmail.com",
|
| 16 |
+
"root": "/root/SuperTinyLanguageModels/outputs/2024-09-23/08-40-08",
|
| 17 |
+
"host": "11c6e13f6a55",
|
| 18 |
+
"username": "root",
|
| 19 |
+
"executable": "/root/SuperTinyLanguageModels/.conda/bin/python3",
|
| 20 |
+
"cpu_count": 128,
|
| 21 |
+
"cpu_count_logical": 256,
|
| 22 |
+
"gpu": "[NVIDIA GeForce RTX 4090, NVIDIA GeForce RTX 4090, NVIDIA GeForce RTX 4090, NVIDIA GeForce RTX 4090, NVIDIA GeForce RTX 4090, NVIDIA GeForce RTX 4090, NVIDIA GeForce RTX 4090, NVIDIA GeForce RTX 4090]",
|
| 23 |
+
"gpu_count": 8,
|
| 24 |
+
"disk": {
|
| 25 |
+
"/": {
|
| 26 |
+
"total": "1123133947904",
|
| 27 |
+
"used": "551794225152"
|
| 28 |
+
}
|
| 29 |
+
},
|
| 30 |
+
"memory": {
|
| 31 |
+
"total": "540812599296"
|
| 32 |
+
},
|
| 33 |
+
"cpu": {
|
| 34 |
+
"count": 128,
|
| 35 |
+
"countLogical": 256
|
| 36 |
+
},
|
| 37 |
+
"gpu_nvidia": [
|
| 38 |
+
{
|
| 39 |
+
"name": "NVIDIA GeForce RTX 4090",
|
| 40 |
+
"memoryTotal": "25757220864",
|
| 41 |
+
"cudaCores": 16384,
|
| 42 |
+
"architecture": "Ada"
|
| 43 |
+
},
|
| 44 |
+
{
|
| 45 |
+
"name": "NVIDIA GeForce RTX 4090",
|
| 46 |
+
"memoryTotal": "25757220864",
|
| 47 |
+
"cudaCores": 16384,
|
| 48 |
+
"architecture": "Ada"
|
| 49 |
+
},
|
| 50 |
+
{
|
| 51 |
+
"name": "NVIDIA GeForce RTX 4090",
|
| 52 |
+
"memoryTotal": "25757220864",
|
| 53 |
+
"cudaCores": 16384,
|
| 54 |
+
"architecture": "Ada"
|
| 55 |
+
},
|
| 56 |
+
{
|
| 57 |
+
"name": "NVIDIA GeForce RTX 4090",
|
| 58 |
+
"memoryTotal": "25757220864",
|
| 59 |
+
"cudaCores": 16384,
|
| 60 |
+
"architecture": "Ada"
|
| 61 |
+
},
|
| 62 |
+
{
|
| 63 |
+
"name": "NVIDIA GeForce RTX 4090",
|
| 64 |
+
"memoryTotal": "25757220864",
|
| 65 |
+
"cudaCores": 16384,
|
| 66 |
+
"architecture": "Ada"
|
| 67 |
+
},
|
| 68 |
+
{
|
| 69 |
+
"name": "NVIDIA GeForce RTX 4090",
|
| 70 |
+
"memoryTotal": "25757220864",
|
| 71 |
+
"cudaCores": 16384,
|
| 72 |
+
"architecture": "Ada"
|
| 73 |
+
},
|
| 74 |
+
{
|
| 75 |
+
"name": "NVIDIA GeForce RTX 4090",
|
| 76 |
+
"memoryTotal": "25757220864",
|
| 77 |
+
"cudaCores": 16384,
|
| 78 |
+
"architecture": "Ada"
|
| 79 |
+
},
|
| 80 |
+
{
|
| 81 |
+
"name": "NVIDIA GeForce RTX 4090",
|
| 82 |
+
"memoryTotal": "25757220864",
|
| 83 |
+
"cudaCores": 16384,
|
| 84 |
+
"architecture": "Ada"
|
| 85 |
+
}
|
| 86 |
+
],
|
| 87 |
+
"cudaVersion": "12.5"
|
| 88 |
+
}
|
2024-09-23/08-40-08/wandb/run-20240923_091422-a2kxhd8v/files/wandb-summary.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"_wandb":{"runtime":4}}
|
2024-09-23/08-40-08/wandb/run-20240923_091422-a2kxhd8v/logs/debug-core.log
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"time":"2024-09-23T09:14:21.933081362Z","level":"INFO","msg":"started logging, with flags","port-filename":"/tmp/tmp9hgpve6u/port-78108.txt","pid":78108,"debug":false,"disable-analytics":false}
|
| 2 |
+
{"time":"2024-09-23T09:14:21.933136193Z","level":"INFO","msg":"FeatureState","shutdownOnParentExitEnabled":false}
|
| 3 |
+
{"time":"2024-09-23T09:14:21.935284221Z","level":"INFO","msg":"Will exit if parent process dies.","ppid":78108}
|
| 4 |
+
{"time":"2024-09-23T09:14:21.935348272Z","level":"INFO","msg":"server is running","addr":{"IP":"127.0.0.1","Port":43153,"Zone":""}}
|
| 5 |
+
{"time":"2024-09-23T09:14:22.076126266Z","level":"INFO","msg":"created new connection","id":"127.0.0.1:57616"}
|
| 6 |
+
{"time":"2024-09-23T09:14:22.595626377Z","level":"INFO","msg":"connection init received","streamId":"a2kxhd8v","id":"127.0.0.1:57616"}
|
| 7 |
+
{"time":"2024-09-23T09:14:22.595853241Z","level":"ERROR","msg":"error creating symlink","error":"symlink /root/.cache/wandb/logs/core-debug-20240923_091421.log /root/SuperTinyLanguageModels/outputs/2024-09-23/08-40-08/wandb/run-20240923_091422-a2kxhd8v/logs/debug-core.log: file exists"}
|
| 8 |
+
{"time":"2024-09-23T09:14:22.59919809Z","level":"INFO","msg":"connection init completed","streamId":"a2kxhd8v","id":"127.0.0.1:57616"}
|
| 9 |
+
{"time":"2024-09-23T09:14:27.103590738Z","level":"INFO","msg":"connection: teardown","id":"127.0.0.1:57616"}
|
| 10 |
+
{"time":"2024-09-23T09:14:27.103797162Z","level":"INFO","msg":"server is shutting down"}
|
| 11 |
+
{"time":"2024-09-23T09:14:27.104072727Z","level":"INFO","msg":"closed connection","id":"127.0.0.1:57616"}
|
| 12 |
+
{"time":"2024-09-23T09:14:28.465863147Z","level":"INFO","msg":"Parent process exited, terminating service process."}
|
2024-09-23/08-40-08/wandb/run-20240923_091422-a2kxhd8v/logs/debug-internal.log
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"time":"2024-09-23T09:14:22.59580271Z","level":"INFO","msg":"using version","core version":"0.18.1"}
|
| 2 |
+
{"time":"2024-09-23T09:14:22.59581747Z","level":"INFO","msg":"created symlink","path":"/root/SuperTinyLanguageModels/outputs/2024-09-23/08-40-08/wandb/run-20240923_091422-a2kxhd8v/logs/debug-core.log"}
|
| 3 |
+
{"time":"2024-09-23T09:14:22.595881422Z","level":"INFO","msg":"using version","core version":"0.18.1"}
|
| 4 |
+
{"time":"2024-09-23T09:14:22.595887882Z","level":"INFO","msg":"created symlink","path":"/root/SuperTinyLanguageModels/outputs/2024-09-23/08-40-08/wandb/run-20240923_091422-a2kxhd8v/logs/debug-core.log"}
|
| 5 |
+
{"time":"2024-09-23T09:14:22.59917443Z","level":"INFO","msg":"created new stream","id":"a2kxhd8v"}
|
| 6 |
+
{"time":"2024-09-23T09:14:22.59919309Z","level":"INFO","msg":"stream: started","id":"a2kxhd8v"}
|
| 7 |
+
{"time":"2024-09-23T09:14:22.59921417Z","level":"INFO","msg":"sender: started","stream_id":{"value":"a2kxhd8v"}}
|
| 8 |
+
{"time":"2024-09-23T09:14:22.599226691Z","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"a2kxhd8v"}}
|
| 9 |
+
{"time":"2024-09-23T09:14:22.599236461Z","level":"INFO","msg":"handler: started","stream_id":{"value":"a2kxhd8v"}}
|
| 10 |
+
{"time":"2024-09-23T09:14:22.982350736Z","level":"INFO","msg":"wandb-core","!BADKEY":null}
|
| 11 |
+
{"time":"2024-09-23T09:14:22.985015444Z","level":"INFO","msg":"Starting system monitor"}
|
| 12 |
+
{"time":"2024-09-23T09:14:27.10372121Z","level":"INFO","msg":"stream: closing","id":"a2kxhd8v"}
|
| 13 |
+
{"time":"2024-09-23T09:14:27.103806442Z","level":"INFO","msg":"Stopping system monitor"}
|
| 14 |
+
{"time":"2024-09-23T09:14:27.104964992Z","level":"INFO","msg":"Stopped system monitor"}
|
2024-09-23/08-40-08/wandb/run-20240923_091422-a2kxhd8v/logs/debug.log
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
2024-09-23 09:14:22,582 INFO MainThread:78108 [wandb_setup.py:_flush():77] Current SDK version is 0.18.1
|
| 2 |
+
2024-09-23 09:14:22,582 INFO MainThread:78108 [wandb_setup.py:_flush():77] Configure stats pid to 78108
|
| 3 |
+
2024-09-23 09:14:22,582 INFO MainThread:78108 [wandb_setup.py:_flush():77] Loading settings from /root/.config/wandb/settings
|
| 4 |
+
2024-09-23 09:14:22,583 INFO MainThread:78108 [wandb_setup.py:_flush():77] Loading settings from /root/SuperTinyLanguageModels/outputs/2024-09-23/08-40-08/wandb/settings
|
| 5 |
+
2024-09-23 09:14:22,583 INFO MainThread:78108 [wandb_setup.py:_flush():77] Loading settings from environment variables: {}
|
| 6 |
+
2024-09-23 09:14:22,583 INFO MainThread:78108 [wandb_setup.py:_flush():77] Applying setup settings: {'mode': None, '_disable_service': None}
|
| 7 |
+
2024-09-23 09:14:22,583 INFO MainThread:78108 [wandb_setup.py:_flush():77] Inferring run settings from compute environment: {'program_relpath': 'train.py', 'program_abspath': '/root/SuperTinyLanguageModels/train.py', 'program': '/root/SuperTinyLanguageModels/train.py'}
|
| 8 |
+
2024-09-23 09:14:22,583 INFO MainThread:78108 [wandb_setup.py:_flush():77] Applying login settings: {}
|
| 9 |
+
2024-09-23 09:14:22,583 INFO MainThread:78108 [wandb_init.py:_log_setup():532] Logging user logs to /root/SuperTinyLanguageModels/outputs/2024-09-23/08-40-08/wandb/run-20240923_091422-a2kxhd8v/logs/debug.log
|
| 10 |
+
2024-09-23 09:14:22,583 INFO MainThread:78108 [wandb_init.py:_log_setup():533] Logging internal logs to /root/SuperTinyLanguageModels/outputs/2024-09-23/08-40-08/wandb/run-20240923_091422-a2kxhd8v/logs/debug-internal.log
|
| 11 |
+
2024-09-23 09:14:22,583 INFO MainThread:78108 [wandb_init.py:init():616] calling init triggers
|
| 12 |
+
2024-09-23 09:14:22,583 INFO MainThread:78108 [wandb_init.py:init():623] wandb.init called with sweep_config: {}
|
| 13 |
+
config: {'model': {'core_model_type': 'pass_through', 'hidden_dim': 384, 'byte_hidden': 128, 'max_chunk_length': 12, 'max_num_chunks': 1024, 'num_delimiter_layers': 3, 'num_byte_decoder_layers': 5, 'target_chunk_len': 8.0, 'chunk_len_loss_weight': 0.1, 'chunk_len_penalty': 0.1, 'context_window': 8192, 'embedding_model_type': 'byte_level', 'tokenizer_type': 'bpe', 'tokenizer_dataset_name': 'simple_en_wiki', 'tokenizer_simplify_data': True, 'vocab_size': 259, 'lm_head_type': 'byte_level', 'lm_head_normalization': 'rms_norm', 'lm_head_bias': False, 'lm_head_dropout': 0.0, 'model_shell_type': 'byte_autoencoder_shell', 'embedding_weight_tying': True, 'ffn_weight_tying': False, 'cproj_weight_tying': False, 'positional_encoding_type': 'rope'}, 'trainer': {'trainer_type': 'base_trainer', 'dataset': 'fineweb_edu_10B', 'batch_size': 6, 'gradient_accumulation_steps': 8, 'max_iters': 10000, 'eval_interval': 50000000, 'log_interval': 1, 'checkpoint_interval': 1000, 'eval_iters': 1000, 'run_eval': False, 'eval': {'mcq_benchmarks': None, 'mcq_num_samples': 1000, 'eval_byte_metrics': False, 'text_modeling_eval': False, 'text_generation_eval': False}, 'optimizer': {'optimizer_name': 'adamW', 'lr': 0.0005, 'min_lr': 5e-05, 'weight_decay': 0.01, 'beta1': 0.9, 'beta2': 0.95, 'grad_clip': 1.0}, 'lr_scheduler': {'name': 'cosine', 'warmup_iters': 100}, 'dataloader': {'name': 'autoencoder'}, 'datasampling': {'name': 'standard'}, 'loss_fn': {'name': 'pass_through'}}, 'general': {'logging': {'wandb_log': True, 'wandb_project': 'SuperTinyLanguageModels', 'wandb_run_name': None, 'group_name': 'experimental_byte_level'}, 'paths': {'output_dir': 'outputs', 'data_dir': '/root/SuperTinyLanguageModels/data', 'checkpoint_dir': 'checkpoints', 'eval_dir': '/root/SuperTinyLanguageModels/evals'}, 'seed': 489, 'device': 'cuda'}}
|
| 14 |
+
2024-09-23 09:14:22,583 INFO MainThread:78108 [wandb_init.py:init():666] starting backend
|
| 15 |
+
2024-09-23 09:14:22,583 INFO MainThread:78108 [wandb_init.py:init():670] setting up manager
|
| 16 |
+
2024-09-23 09:14:22,584 INFO MainThread:78108 [backend.py:_multiprocessing_setup():105] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
|
| 17 |
+
2024-09-23 09:14:22,586 INFO MainThread:78108 [wandb_init.py:init():678] backend started and connected
|
| 18 |
+
2024-09-23 09:14:22,588 INFO MainThread:78108 [wandb_init.py:init():773] updated telemetry
|
| 19 |
+
2024-09-23 09:14:22,598 INFO MainThread:78108 [wandb_init.py:init():806] communicating run to backend with 90.0 second timeout
|
| 20 |
+
2024-09-23 09:14:22,974 INFO MainThread:78108 [wandb_init.py:init():857] starting run threads in backend
|
| 21 |
+
2024-09-23 09:14:23,128 INFO MainThread:78108 [wandb_run.py:_console_start():2459] atexit reg
|
| 22 |
+
2024-09-23 09:14:23,128 INFO MainThread:78108 [wandb_run.py:_redirect():2307] redirect: wrap_raw
|
| 23 |
+
2024-09-23 09:14:23,129 INFO MainThread:78108 [wandb_run.py:_redirect():2372] Wrapping output streams.
|
| 24 |
+
2024-09-23 09:14:23,129 INFO MainThread:78108 [wandb_run.py:_redirect():2397] Redirects installed.
|
| 25 |
+
2024-09-23 09:14:23,135 INFO MainThread:78108 [wandb_init.py:init():900] run started, returning control to user process
|
| 26 |
+
2024-09-23 09:14:27,104 WARNING MsgRouterThr:78108 [router.py:message_loop():77] message_loop has been closed
|
2024-09-23/08-40-08/wandb/run-20240923_091422-a2kxhd8v/run-a2kxhd8v.wandb
ADDED
|
File without changes
|
2024-09-23/09-32-28/.hydra/config.yaml
ADDED
|
@@ -0,0 +1,74 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
experimental:
|
| 2 |
+
model:
|
| 3 |
+
core_model_type: pass_through
|
| 4 |
+
hidden_dim: 384
|
| 5 |
+
byte_hidden: 128
|
| 6 |
+
max_chunk_length: 12
|
| 7 |
+
max_num_chunks: 1024
|
| 8 |
+
num_delimiter_layers: 3
|
| 9 |
+
num_byte_decoder_layers: 5
|
| 10 |
+
target_chunk_len: 8.0
|
| 11 |
+
chunk_len_loss_weight: 0.1
|
| 12 |
+
chunk_len_penalty: 0.1
|
| 13 |
+
context_window: 8192
|
| 14 |
+
embedding_model_type: byte_level
|
| 15 |
+
tokenizer_type: bpe
|
| 16 |
+
tokenizer_dataset_name: simple_en_wiki
|
| 17 |
+
tokenizer_simplify_data: true
|
| 18 |
+
vocab_size: 259
|
| 19 |
+
lm_head_type: byte_level
|
| 20 |
+
lm_head_normalization: rms_norm
|
| 21 |
+
lm_head_bias: false
|
| 22 |
+
lm_head_dropout: 0.0
|
| 23 |
+
model_shell_type: byte_autoencoder_shell
|
| 24 |
+
embedding_weight_tying: true
|
| 25 |
+
ffn_weight_tying: false
|
| 26 |
+
cproj_weight_tying: false
|
| 27 |
+
positional_encoding_type: rope
|
| 28 |
+
trainer:
|
| 29 |
+
trainer_type: base_trainer
|
| 30 |
+
dataset: fineweb_edu_10B
|
| 31 |
+
batch_size: 6
|
| 32 |
+
gradient_accumulation_steps: 8
|
| 33 |
+
max_iters: 10000
|
| 34 |
+
eval_interval: 50000000
|
| 35 |
+
log_interval: 1
|
| 36 |
+
checkpoint_interval: 1000
|
| 37 |
+
eval_iters: 1000
|
| 38 |
+
run_eval: false
|
| 39 |
+
eval:
|
| 40 |
+
mcq_benchmarks: null
|
| 41 |
+
mcq_num_samples: 1000
|
| 42 |
+
eval_byte_metrics: false
|
| 43 |
+
text_modeling_eval: false
|
| 44 |
+
text_generation_eval: false
|
| 45 |
+
optimizer:
|
| 46 |
+
optimizer_name: adamW
|
| 47 |
+
lr: 0.0005
|
| 48 |
+
min_lr: 5.0e-05
|
| 49 |
+
weight_decay: 0.01
|
| 50 |
+
beta1: 0.9
|
| 51 |
+
beta2: 0.95
|
| 52 |
+
grad_clip: 1.0
|
| 53 |
+
lr_scheduler:
|
| 54 |
+
name: cosine
|
| 55 |
+
warmup_iters: 100
|
| 56 |
+
dataloader:
|
| 57 |
+
name: autoencoder
|
| 58 |
+
datasampling:
|
| 59 |
+
name: standard
|
| 60 |
+
loss_fn:
|
| 61 |
+
name: pass_through
|
| 62 |
+
general:
|
| 63 |
+
logging:
|
| 64 |
+
wandb_log: true
|
| 65 |
+
wandb_project: SuperTinyLanguageModels
|
| 66 |
+
wandb_run_name: null
|
| 67 |
+
group_name: experimental_byte_level
|
| 68 |
+
paths:
|
| 69 |
+
output_dir: outputs
|
| 70 |
+
data_dir: data
|
| 71 |
+
checkpoint_dir: checkpoints
|
| 72 |
+
eval_dir: evals
|
| 73 |
+
seed: 489
|
| 74 |
+
device: cuda
|
2024-09-23/09-32-28/.hydra/hydra.yaml
ADDED
|
@@ -0,0 +1,154 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
hydra:
|
| 2 |
+
run:
|
| 3 |
+
dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S}
|
| 4 |
+
sweep:
|
| 5 |
+
dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S}
|
| 6 |
+
subdir: ${hydra.job.num}
|
| 7 |
+
launcher:
|
| 8 |
+
_target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher
|
| 9 |
+
sweeper:
|
| 10 |
+
_target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper
|
| 11 |
+
max_batch_size: null
|
| 12 |
+
params: null
|
| 13 |
+
help:
|
| 14 |
+
app_name: ${hydra.job.name}
|
| 15 |
+
header: '${hydra.help.app_name} is powered by Hydra.
|
| 16 |
+
|
| 17 |
+
'
|
| 18 |
+
footer: 'Powered by Hydra (https://hydra.cc)
|
| 19 |
+
|
| 20 |
+
Use --hydra-help to view Hydra specific help
|
| 21 |
+
|
| 22 |
+
'
|
| 23 |
+
template: '${hydra.help.header}
|
| 24 |
+
|
| 25 |
+
== Configuration groups ==
|
| 26 |
+
|
| 27 |
+
Compose your configuration from those groups (group=option)
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
$APP_CONFIG_GROUPS
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
== Config ==
|
| 34 |
+
|
| 35 |
+
Override anything in the config (foo.bar=value)
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
$CONFIG
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
${hydra.help.footer}
|
| 42 |
+
|
| 43 |
+
'
|
| 44 |
+
hydra_help:
|
| 45 |
+
template: 'Hydra (${hydra.runtime.version})
|
| 46 |
+
|
| 47 |
+
See https://hydra.cc for more info.
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
== Flags ==
|
| 51 |
+
|
| 52 |
+
$FLAGS_HELP
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
== Configuration groups ==
|
| 56 |
+
|
| 57 |
+
Compose your configuration from those groups (For example, append hydra/job_logging=disabled
|
| 58 |
+
to command line)
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
$HYDRA_CONFIG_GROUPS
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
Use ''--cfg hydra'' to Show the Hydra config.
|
| 65 |
+
|
| 66 |
+
'
|
| 67 |
+
hydra_help: ???
|
| 68 |
+
hydra_logging:
|
| 69 |
+
version: 1
|
| 70 |
+
formatters:
|
| 71 |
+
simple:
|
| 72 |
+
format: '[%(asctime)s][HYDRA] %(message)s'
|
| 73 |
+
handlers:
|
| 74 |
+
console:
|
| 75 |
+
class: logging.StreamHandler
|
| 76 |
+
formatter: simple
|
| 77 |
+
stream: ext://sys.stdout
|
| 78 |
+
root:
|
| 79 |
+
level: INFO
|
| 80 |
+
handlers:
|
| 81 |
+
- console
|
| 82 |
+
loggers:
|
| 83 |
+
logging_example:
|
| 84 |
+
level: DEBUG
|
| 85 |
+
disable_existing_loggers: false
|
| 86 |
+
job_logging:
|
| 87 |
+
version: 1
|
| 88 |
+
formatters:
|
| 89 |
+
simple:
|
| 90 |
+
format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s'
|
| 91 |
+
handlers:
|
| 92 |
+
console:
|
| 93 |
+
class: logging.StreamHandler
|
| 94 |
+
formatter: simple
|
| 95 |
+
stream: ext://sys.stdout
|
| 96 |
+
file:
|
| 97 |
+
class: logging.FileHandler
|
| 98 |
+
formatter: simple
|
| 99 |
+
filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log
|
| 100 |
+
root:
|
| 101 |
+
level: INFO
|
| 102 |
+
handlers:
|
| 103 |
+
- console
|
| 104 |
+
- file
|
| 105 |
+
disable_existing_loggers: false
|
| 106 |
+
env: {}
|
| 107 |
+
mode: RUN
|
| 108 |
+
searchpath: []
|
| 109 |
+
callbacks: {}
|
| 110 |
+
output_subdir: .hydra
|
| 111 |
+
overrides:
|
| 112 |
+
hydra:
|
| 113 |
+
- hydra.mode=RUN
|
| 114 |
+
task: []
|
| 115 |
+
job:
|
| 116 |
+
name: train
|
| 117 |
+
chdir: null
|
| 118 |
+
override_dirname: ''
|
| 119 |
+
id: ???
|
| 120 |
+
num: ???
|
| 121 |
+
config_name: experimental/byte_autoencoder_1
|
| 122 |
+
env_set: {}
|
| 123 |
+
env_copy: []
|
| 124 |
+
config:
|
| 125 |
+
override_dirname:
|
| 126 |
+
kv_sep: '='
|
| 127 |
+
item_sep: ','
|
| 128 |
+
exclude_keys: []
|
| 129 |
+
runtime:
|
| 130 |
+
version: 1.3.2
|
| 131 |
+
version_base: '1.1'
|
| 132 |
+
cwd: /root/SuperTinyLanguageModels
|
| 133 |
+
config_sources:
|
| 134 |
+
- path: hydra.conf
|
| 135 |
+
schema: pkg
|
| 136 |
+
provider: hydra
|
| 137 |
+
- path: /root/SuperTinyLanguageModels/configs/train
|
| 138 |
+
schema: file
|
| 139 |
+
provider: main
|
| 140 |
+
- path: ''
|
| 141 |
+
schema: structured
|
| 142 |
+
provider: schema
|
| 143 |
+
output_dir: /root/SuperTinyLanguageModels/outputs/2024-09-23/09-32-28
|
| 144 |
+
choices:
|
| 145 |
+
hydra/env: default
|
| 146 |
+
hydra/callbacks: null
|
| 147 |
+
hydra/job_logging: default
|
| 148 |
+
hydra/hydra_logging: default
|
| 149 |
+
hydra/hydra_help: default
|
| 150 |
+
hydra/help: default
|
| 151 |
+
hydra/sweeper: basic
|
| 152 |
+
hydra/launcher: basic
|
| 153 |
+
hydra/output: default
|
| 154 |
+
verbose: false
|
2024-09-23/09-32-28/.hydra/overrides.yaml
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
[]
|
2024-09-23/09-32-28/train.log
ADDED
|
File without changes
|
2024-09-23/09-32-28/wandb/debug-internal.log
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"time":"2024-09-23T09:32:37.2270228Z","level":"INFO","msg":"using version","core version":"0.18.1"}
|
| 2 |
+
{"time":"2024-09-23T09:32:37.227060611Z","level":"INFO","msg":"created symlink","path":"/root/SuperTinyLanguageModels/outputs/2024-09-23/09-32-28/wandb/run-20240923_093237-tkkvzfon/logs/debug-core.log"}
|
| 3 |
+
{"time":"2024-09-23T09:32:37.227169702Z","level":"INFO","msg":"using version","core version":"0.18.1"}
|
| 4 |
+
{"time":"2024-09-23T09:32:37.227182172Z","level":"INFO","msg":"created symlink","path":"/root/SuperTinyLanguageModels/outputs/2024-09-23/09-32-28/wandb/run-20240923_093237-tkkvzfon/logs/debug-core.log"}
|
| 5 |
+
{"time":"2024-09-23T09:32:37.230824708Z","level":"INFO","msg":"created new stream","id":"tkkvzfon"}
|
| 6 |
+
{"time":"2024-09-23T09:32:37.230859859Z","level":"INFO","msg":"stream: started","id":"tkkvzfon"}
|
| 7 |
+
{"time":"2024-09-23T09:32:37.230903499Z","level":"INFO","msg":"sender: started","stream_id":{"value":"tkkvzfon"}}
|
| 8 |
+
{"time":"2024-09-23T09:32:37.23092371Z","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"tkkvzfon"}}
|
| 9 |
+
{"time":"2024-09-23T09:32:37.23097304Z","level":"INFO","msg":"handler: started","stream_id":{"value":"tkkvzfon"}}
|
| 10 |
+
{"time":"2024-09-23T09:32:37.634282756Z","level":"INFO","msg":"wandb-core","!BADKEY":null}
|
| 11 |
+
{"time":"2024-09-23T09:32:37.636527894Z","level":"INFO","msg":"Starting system monitor"}
|
| 12 |
+
{"time":"2024-09-23T09:33:46.746283667Z","level":"INFO","msg":"stream: closing","id":"tkkvzfon"}
|
| 13 |
+
{"time":"2024-09-23T09:33:46.746349498Z","level":"INFO","msg":"Stopping system monitor"}
|
| 14 |
+
{"time":"2024-09-23T09:33:46.747359311Z","level":"INFO","msg":"Stopped system monitor"}
|
| 15 |
+
{"time":"2024-09-23T09:33:49.926631346Z","level":"INFO","msg":"handler: closed","stream_id":{"value":"tkkvzfon"}}
|
| 16 |
+
{"time":"2024-09-23T09:33:49.926725448Z","level":"INFO","msg":"writer: Close: closed","stream_id":{"value":"tkkvzfon"}}
|
| 17 |
+
{"time":"2024-09-23T09:33:49.926795918Z","level":"INFO","msg":"sender: closed","stream_id":{"value":"tkkvzfon"}}
|
| 18 |
+
{"time":"2024-09-23T09:33:49.927056922Z","level":"INFO","msg":"stream: closed","id":"tkkvzfon"}
|
2024-09-23/09-32-28/wandb/debug.log
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
2024-09-23 09:32:37,221 INFO MainThread:81916 [wandb_setup.py:_flush():77] Current SDK version is 0.18.1
|
| 2 |
+
2024-09-23 09:32:37,221 INFO MainThread:81916 [wandb_setup.py:_flush():77] Configure stats pid to 81916
|
| 3 |
+
2024-09-23 09:32:37,221 INFO MainThread:81916 [wandb_setup.py:_flush():77] Loading settings from /root/.config/wandb/settings
|
| 4 |
+
2024-09-23 09:32:37,221 INFO MainThread:81916 [wandb_setup.py:_flush():77] Loading settings from /root/SuperTinyLanguageModels/outputs/2024-09-23/09-32-28/wandb/settings
|
| 5 |
+
2024-09-23 09:32:37,221 INFO MainThread:81916 [wandb_setup.py:_flush():77] Loading settings from environment variables: {}
|
| 6 |
+
2024-09-23 09:32:37,221 INFO MainThread:81916 [wandb_setup.py:_flush():77] Applying setup settings: {'mode': None, '_disable_service': None}
|
| 7 |
+
2024-09-23 09:32:37,221 INFO MainThread:81916 [wandb_setup.py:_flush():77] Inferring run settings from compute environment: {'program_relpath': 'train.py', 'program_abspath': '/root/SuperTinyLanguageModels/train.py', 'program': '/root/SuperTinyLanguageModels/train.py'}
|
| 8 |
+
2024-09-23 09:32:37,221 INFO MainThread:81916 [wandb_setup.py:_flush():77] Applying login settings: {}
|
| 9 |
+
2024-09-23 09:32:37,221 INFO MainThread:81916 [wandb_init.py:_log_setup():532] Logging user logs to /root/SuperTinyLanguageModels/outputs/2024-09-23/09-32-28/wandb/run-20240923_093237-tkkvzfon/logs/debug.log
|
| 10 |
+
2024-09-23 09:32:37,221 INFO MainThread:81916 [wandb_init.py:_log_setup():533] Logging internal logs to /root/SuperTinyLanguageModels/outputs/2024-09-23/09-32-28/wandb/run-20240923_093237-tkkvzfon/logs/debug-internal.log
|
| 11 |
+
2024-09-23 09:32:37,222 INFO MainThread:81916 [wandb_init.py:init():616] calling init triggers
|
| 12 |
+
2024-09-23 09:32:37,222 INFO MainThread:81916 [wandb_init.py:init():623] wandb.init called with sweep_config: {}
|
| 13 |
+
config: {'model': {'core_model_type': 'pass_through', 'hidden_dim': 384, 'byte_hidden': 128, 'max_chunk_length': 12, 'max_num_chunks': 1024, 'num_delimiter_layers': 3, 'num_byte_decoder_layers': 5, 'target_chunk_len': 8.0, 'chunk_len_loss_weight': 0.1, 'chunk_len_penalty': 0.1, 'context_window': 8192, 'embedding_model_type': 'byte_level', 'tokenizer_type': 'bpe', 'tokenizer_dataset_name': 'simple_en_wiki', 'tokenizer_simplify_data': True, 'vocab_size': 259, 'lm_head_type': 'byte_level', 'lm_head_normalization': 'rms_norm', 'lm_head_bias': False, 'lm_head_dropout': 0.0, 'model_shell_type': 'byte_autoencoder_shell', 'embedding_weight_tying': True, 'ffn_weight_tying': False, 'cproj_weight_tying': False, 'positional_encoding_type': 'rope'}, 'trainer': {'trainer_type': 'base_trainer', 'dataset': 'fineweb_edu_10B', 'batch_size': 6, 'gradient_accumulation_steps': 8, 'max_iters': 10000, 'eval_interval': 50000000, 'log_interval': 1, 'checkpoint_interval': 1000, 'eval_iters': 1000, 'run_eval': False, 'eval': {'mcq_benchmarks': None, 'mcq_num_samples': 1000, 'eval_byte_metrics': False, 'text_modeling_eval': False, 'text_generation_eval': False}, 'optimizer': {'optimizer_name': 'adamW', 'lr': 0.0005, 'min_lr': 5e-05, 'weight_decay': 0.01, 'beta1': 0.9, 'beta2': 0.95, 'grad_clip': 1.0}, 'lr_scheduler': {'name': 'cosine', 'warmup_iters': 100}, 'dataloader': {'name': 'autoencoder'}, 'datasampling': {'name': 'standard'}, 'loss_fn': {'name': 'pass_through'}}, 'general': {'logging': {'wandb_log': True, 'wandb_project': 'SuperTinyLanguageModels', 'wandb_run_name': None, 'group_name': 'experimental_byte_level'}, 'paths': {'output_dir': 'outputs', 'data_dir': '/root/SuperTinyLanguageModels/data', 'checkpoint_dir': 'checkpoints', 'eval_dir': '/root/SuperTinyLanguageModels/evals'}, 'seed': 489, 'device': 'cuda'}}
|
| 14 |
+
2024-09-23 09:32:37,222 INFO MainThread:81916 [wandb_init.py:init():666] starting backend
|
| 15 |
+
2024-09-23 09:32:37,222 INFO MainThread:81916 [wandb_init.py:init():670] setting up manager
|
| 16 |
+
2024-09-23 09:32:37,223 INFO MainThread:81916 [backend.py:_multiprocessing_setup():105] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
|
| 17 |
+
2024-09-23 09:32:37,224 INFO MainThread:81916 [wandb_init.py:init():678] backend started and connected
|
| 18 |
+
2024-09-23 09:32:37,227 INFO MainThread:81916 [wandb_init.py:init():773] updated telemetry
|
| 19 |
+
2024-09-23 09:32:37,236 INFO MainThread:81916 [wandb_init.py:init():806] communicating run to backend with 90.0 second timeout
|
| 20 |
+
2024-09-23 09:32:37,631 INFO MainThread:81916 [wandb_init.py:init():857] starting run threads in backend
|
| 21 |
+
2024-09-23 09:32:37,802 INFO MainThread:81916 [wandb_run.py:_console_start():2459] atexit reg
|
| 22 |
+
2024-09-23 09:32:37,802 INFO MainThread:81916 [wandb_run.py:_redirect():2307] redirect: wrap_raw
|
| 23 |
+
2024-09-23 09:32:37,803 INFO MainThread:81916 [wandb_run.py:_redirect():2372] Wrapping output streams.
|
| 24 |
+
2024-09-23 09:32:37,803 INFO MainThread:81916 [wandb_run.py:_redirect():2397] Redirects installed.
|
| 25 |
+
2024-09-23 09:32:37,806 INFO MainThread:81916 [wandb_init.py:init():900] run started, returning control to user process
|
| 26 |
+
2024-09-23 09:33:46,746 WARNING MsgRouterThr:81916 [router.py:message_loop():77] message_loop has been closed
|
2024-09-23/09-32-28/wandb/run-20240923_093237-tkkvzfon/files/config.yaml
ADDED
|
@@ -0,0 +1,115 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
_wandb:
|
| 2 |
+
value:
|
| 3 |
+
cli_version: 0.18.1
|
| 4 |
+
m: []
|
| 5 |
+
python_version: 3.10.14
|
| 6 |
+
t:
|
| 7 |
+
"1":
|
| 8 |
+
- 1
|
| 9 |
+
- 5
|
| 10 |
+
- 11
|
| 11 |
+
- 49
|
| 12 |
+
- 50
|
| 13 |
+
- 51
|
| 14 |
+
- 53
|
| 15 |
+
- 55
|
| 16 |
+
"2":
|
| 17 |
+
- 1
|
| 18 |
+
- 5
|
| 19 |
+
- 11
|
| 20 |
+
- 49
|
| 21 |
+
- 50
|
| 22 |
+
- 51
|
| 23 |
+
- 53
|
| 24 |
+
- 55
|
| 25 |
+
"3":
|
| 26 |
+
- 13
|
| 27 |
+
- 15
|
| 28 |
+
- 16
|
| 29 |
+
- 23
|
| 30 |
+
- 55
|
| 31 |
+
- 61
|
| 32 |
+
"4": 3.10.14
|
| 33 |
+
"5": 0.18.1
|
| 34 |
+
"6": 4.44.2
|
| 35 |
+
"8":
|
| 36 |
+
- 5
|
| 37 |
+
- 9
|
| 38 |
+
"12": 0.18.1
|
| 39 |
+
"13": linux-x86_64
|
| 40 |
+
general:
|
| 41 |
+
value:
|
| 42 |
+
device: cuda
|
| 43 |
+
logging:
|
| 44 |
+
group_name: experimental_byte_level
|
| 45 |
+
wandb_log: true
|
| 46 |
+
wandb_project: SuperTinyLanguageModels
|
| 47 |
+
wandb_run_name: null
|
| 48 |
+
paths:
|
| 49 |
+
checkpoint_dir: checkpoints
|
| 50 |
+
data_dir: /root/SuperTinyLanguageModels/data
|
| 51 |
+
eval_dir: /root/SuperTinyLanguageModels/evals
|
| 52 |
+
output_dir: outputs
|
| 53 |
+
seed: 489
|
| 54 |
+
model:
|
| 55 |
+
value:
|
| 56 |
+
byte_hidden: 128
|
| 57 |
+
chunk_len_loss_weight: 0.1
|
| 58 |
+
chunk_len_penalty: 0.1
|
| 59 |
+
context_window: 8192
|
| 60 |
+
core_model_type: pass_through
|
| 61 |
+
cproj_weight_tying: false
|
| 62 |
+
embedding_model_type: byte_level
|
| 63 |
+
embedding_weight_tying: true
|
| 64 |
+
ffn_weight_tying: false
|
| 65 |
+
hidden_dim: 384
|
| 66 |
+
lm_head_bias: false
|
| 67 |
+
lm_head_dropout: 0
|
| 68 |
+
lm_head_normalization: rms_norm
|
| 69 |
+
lm_head_type: byte_level
|
| 70 |
+
max_chunk_length: 12
|
| 71 |
+
max_num_chunks: 1024
|
| 72 |
+
model_shell_type: byte_autoencoder_shell
|
| 73 |
+
num_byte_decoder_layers: 5
|
| 74 |
+
num_delimiter_layers: 3
|
| 75 |
+
positional_encoding_type: rope
|
| 76 |
+
target_chunk_len: 8
|
| 77 |
+
tokenizer_dataset_name: simple_en_wiki
|
| 78 |
+
tokenizer_simplify_data: true
|
| 79 |
+
tokenizer_type: bpe
|
| 80 |
+
vocab_size: 259
|
| 81 |
+
trainer:
|
| 82 |
+
value:
|
| 83 |
+
batch_size: 6
|
| 84 |
+
checkpoint_interval: 1000
|
| 85 |
+
dataloader:
|
| 86 |
+
name: autoencoder
|
| 87 |
+
datasampling:
|
| 88 |
+
name: standard
|
| 89 |
+
dataset: fineweb_edu_10B
|
| 90 |
+
eval:
|
| 91 |
+
eval_byte_metrics: false
|
| 92 |
+
mcq_benchmarks: null
|
| 93 |
+
mcq_num_samples: 1000
|
| 94 |
+
text_generation_eval: false
|
| 95 |
+
text_modeling_eval: false
|
| 96 |
+
eval_interval: 50000000
|
| 97 |
+
eval_iters: 1000
|
| 98 |
+
gradient_accumulation_steps: 8
|
| 99 |
+
log_interval: 1
|
| 100 |
+
loss_fn:
|
| 101 |
+
name: pass_through
|
| 102 |
+
lr_scheduler:
|
| 103 |
+
name: cosine
|
| 104 |
+
warmup_iters: 100
|
| 105 |
+
max_iters: 10000
|
| 106 |
+
optimizer:
|
| 107 |
+
beta1: 0.9
|
| 108 |
+
beta2: 0.95
|
| 109 |
+
grad_clip: 1
|
| 110 |
+
lr: 0.0005
|
| 111 |
+
min_lr: 5e-05
|
| 112 |
+
optimizer_name: adamW
|
| 113 |
+
weight_decay: 0.01
|
| 114 |
+
run_eval: false
|
| 115 |
+
trainer_type: base_trainer
|
2024-09-23/09-32-28/wandb/run-20240923_093237-tkkvzfon/files/output.log
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Weight and Biases Initialized
|
| 2 |
+
Rank0 Trainer built
|
| 3 |
+
Training loop is starting
|
| 4 |
+
All GPU(s): step 1: loss 10.4062, lr 5.0e-06, dt 2.1s
|
| 5 |
+
All GPU(s): step 2: loss 10.4297, lr 1.0e-05, dt 2.1s
|
| 6 |
+
All GPU(s): step 3: loss 10.3672, lr 1.5e-05, dt 2.1s
|
| 7 |
+
All GPU(s): step 4: loss 10.3203, lr 2.0e-05, dt 2.1s
|
| 8 |
+
All GPU(s): step 5: loss 10.2344, lr 2.5e-05, dt 2.1s
|
| 9 |
+
All GPU(s): step 6: loss 10.1406, lr 3.0e-05, dt 2.1s
|
| 10 |
+
All GPU(s): step 7: loss 10.0234, lr 3.5e-05, dt 2.1s
|
| 11 |
+
All GPU(s): step 8: loss 9.9688, lr 4.0e-05, dt 2.1s
|
| 12 |
+
All GPU(s): step 9: loss 9.8594, lr 4.5e-05, dt 2.2s
|
| 13 |
+
All GPU(s): step 10: loss 9.6328, lr 5.0e-05, dt 2.1s
|
| 14 |
+
All GPU(s): step 11: loss 9.5312, lr 5.5e-05, dt 2.1s
|
| 15 |
+
All GPU(s): step 12: loss 9.3750, lr 6.0e-05, dt 2.1s
|
| 16 |
+
All GPU(s): step 13: loss 9.2109, lr 6.5e-05, dt 2.1s
|
| 17 |
+
All GPU(s): step 14: loss 9.0078, lr 7.0e-05, dt 2.1s
|
| 18 |
+
All GPU(s): step 15: loss 8.8203, lr 7.5e-05, dt 2.1s
|
| 19 |
+
All GPU(s): step 16: loss 8.6562, lr 8.0e-05, dt 2.0s
|
| 20 |
+
All GPU(s): step 17: loss 8.4922, lr 8.5e-05, dt 2.1s
|
| 21 |
+
All GPU(s): step 18: loss 8.2891, lr 9.0e-05, dt 2.1s
|
| 22 |
+
All GPU(s): step 19: loss 8.1328, lr 9.5e-05, dt 2.1s
|
| 23 |
+
All GPU(s): step 20: loss 7.9414, lr 1.0e-04, dt 2.0s
|
| 24 |
+
All GPU(s): step 21: loss 7.7852, lr 1.1e-04, dt 2.1s
|
| 25 |
+
All GPU(s): step 22: loss 7.5977, lr 1.1e-04, dt 2.1s
|
| 26 |
+
All GPU(s): step 23: loss 7.4453, lr 1.2e-04, dt 2.1s
|
| 27 |
+
All GPU(s): step 24: loss 7.3164, lr 1.2e-04, dt 2.1s
|
| 28 |
+
All GPU(s): step 25: loss 7.1836, lr 1.3e-04, dt 2.1s
|
| 29 |
+
All GPU(s): step 26: loss 7.1406, lr 1.3e-04, dt 2.1s
|
| 30 |
+
All GPU(s): step 27: loss 6.9414, lr 1.4e-04, dt 2.1s
|
| 31 |
+
All GPU(s): step 28: loss 6.8633, lr 1.4e-04, dt 2.2s
|
| 32 |
+
All GPU(s): step 29: loss 6.7461, lr 1.5e-04, dt 2.1s
|
| 33 |
+
All GPU(s): step 30: loss 6.5742, lr 1.5e-04, dt 2.1s
|
2024-09-23/09-32-28/wandb/run-20240923_093237-tkkvzfon/files/wandb-metadata.json
ADDED
|
@@ -0,0 +1,88 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"os": "Linux-5.15.0-117-generic-x86_64-with-glibc2.31",
|
| 3 |
+
"python": "3.10.14",
|
| 4 |
+
"startedAt": "2024-09-23T09:32:37.224689Z",
|
| 5 |
+
"args": [
|
| 6 |
+
"--config-name",
|
| 7 |
+
"experimental/byte_autoencoder_1"
|
| 8 |
+
],
|
| 9 |
+
"program": "/root/SuperTinyLanguageModels/train.py",
|
| 10 |
+
"codePath": "train.py",
|
| 11 |
+
"git": {
|
| 12 |
+
"remote": "https://github.com/LeonGuertler/SuperTinyLanguageModels.git",
|
| 13 |
+
"commit": "c36bf6b78927d4d365c52a835f0e178edacbab29"
|
| 14 |
+
},
|
| 15 |
+
"email": "calvin14@gmail.com",
|
| 16 |
+
"root": "/root/SuperTinyLanguageModels/outputs/2024-09-23/09-32-28",
|
| 17 |
+
"host": "11c6e13f6a55",
|
| 18 |
+
"username": "root",
|
| 19 |
+
"executable": "/root/SuperTinyLanguageModels/.conda/bin/python3",
|
| 20 |
+
"cpu_count": 128,
|
| 21 |
+
"cpu_count_logical": 256,
|
| 22 |
+
"gpu": "[NVIDIA GeForce RTX 4090, NVIDIA GeForce RTX 4090, NVIDIA GeForce RTX 4090, NVIDIA GeForce RTX 4090, NVIDIA GeForce RTX 4090, NVIDIA GeForce RTX 4090, NVIDIA GeForce RTX 4090, NVIDIA GeForce RTX 4090]",
|
| 23 |
+
"gpu_count": 8,
|
| 24 |
+
"disk": {
|
| 25 |
+
"/": {
|
| 26 |
+
"total": "1123133947904",
|
| 27 |
+
"used": "551794495488"
|
| 28 |
+
}
|
| 29 |
+
},
|
| 30 |
+
"memory": {
|
| 31 |
+
"total": "540812599296"
|
| 32 |
+
},
|
| 33 |
+
"cpu": {
|
| 34 |
+
"count": 128,
|
| 35 |
+
"countLogical": 256
|
| 36 |
+
},
|
| 37 |
+
"gpu_nvidia": [
|
| 38 |
+
{
|
| 39 |
+
"name": "NVIDIA GeForce RTX 4090",
|
| 40 |
+
"memoryTotal": "25757220864",
|
| 41 |
+
"cudaCores": 16384,
|
| 42 |
+
"architecture": "Ada"
|
| 43 |
+
},
|
| 44 |
+
{
|
| 45 |
+
"name": "NVIDIA GeForce RTX 4090",
|
| 46 |
+
"memoryTotal": "25757220864",
|
| 47 |
+
"cudaCores": 16384,
|
| 48 |
+
"architecture": "Ada"
|
| 49 |
+
},
|
| 50 |
+
{
|
| 51 |
+
"name": "NVIDIA GeForce RTX 4090",
|
| 52 |
+
"memoryTotal": "25757220864",
|
| 53 |
+
"cudaCores": 16384,
|
| 54 |
+
"architecture": "Ada"
|
| 55 |
+
},
|
| 56 |
+
{
|
| 57 |
+
"name": "NVIDIA GeForce RTX 4090",
|
| 58 |
+
"memoryTotal": "25757220864",
|
| 59 |
+
"cudaCores": 16384,
|
| 60 |
+
"architecture": "Ada"
|
| 61 |
+
},
|
| 62 |
+
{
|
| 63 |
+
"name": "NVIDIA GeForce RTX 4090",
|
| 64 |
+
"memoryTotal": "25757220864",
|
| 65 |
+
"cudaCores": 16384,
|
| 66 |
+
"architecture": "Ada"
|
| 67 |
+
},
|
| 68 |
+
{
|
| 69 |
+
"name": "NVIDIA GeForce RTX 4090",
|
| 70 |
+
"memoryTotal": "25757220864",
|
| 71 |
+
"cudaCores": 16384,
|
| 72 |
+
"architecture": "Ada"
|
| 73 |
+
},
|
| 74 |
+
{
|
| 75 |
+
"name": "NVIDIA GeForce RTX 4090",
|
| 76 |
+
"memoryTotal": "25757220864",
|
| 77 |
+
"cudaCores": 16384,
|
| 78 |
+
"architecture": "Ada"
|
| 79 |
+
},
|
| 80 |
+
{
|
| 81 |
+
"name": "NVIDIA GeForce RTX 4090",
|
| 82 |
+
"memoryTotal": "25757220864",
|
| 83 |
+
"cudaCores": 16384,
|
| 84 |
+
"architecture": "Ada"
|
| 85 |
+
}
|
| 86 |
+
],
|
| 87 |
+
"cudaVersion": "12.5"
|
| 88 |
+
}
|
2024-09-23/09-32-28/wandb/run-20240923_093237-tkkvzfon/files/wandb-summary.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"additional_info/chunk_len_penalty_loss":0,"additional_info/total-loss":6.543508529663086,"_step":1474560,"additional_info/chunk_len_loss":2.0561606884002686,"iter":30,"token_num":1474560,"additional_info/BCE-loss":4.487347602844238,"loss":6.57421875,"lr":0.00015,"_timestamp":1.7270840240730202e+09,"_runtime":69.521643938,"additional_info/average_chunk_length":3.4655094146728516,"_wandb":{"runtime":69}}
|
2024-09-23/09-32-28/wandb/run-20240923_093237-tkkvzfon/logs/debug-core.log
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"time":"2024-09-23T09:32:36.53490736Z","level":"INFO","msg":"started logging, with flags","port-filename":"/tmp/tmppr55fcxh/port-81916.txt","pid":81916,"debug":false,"disable-analytics":false}
|
| 2 |
+
{"time":"2024-09-23T09:32:36.534984841Z","level":"INFO","msg":"FeatureState","shutdownOnParentExitEnabled":false}
|
| 3 |
+
{"time":"2024-09-23T09:32:36.551541231Z","level":"INFO","msg":"Will exit if parent process dies.","ppid":81916}
|
| 4 |
+
{"time":"2024-09-23T09:32:36.55148544Z","level":"INFO","msg":"server is running","addr":{"IP":"127.0.0.1","Port":44587,"Zone":""}}
|
| 5 |
+
{"time":"2024-09-23T09:32:36.722786198Z","level":"INFO","msg":"created new connection","id":"127.0.0.1:60908"}
|
| 6 |
+
{"time":"2024-09-23T09:32:37.226730857Z","level":"INFO","msg":"connection init received","streamId":"tkkvzfon","id":"127.0.0.1:60908"}
|
| 7 |
+
{"time":"2024-09-23T09:32:37.227116001Z","level":"ERROR","msg":"error creating symlink","error":"symlink /root/.cache/wandb/logs/core-debug-20240923_093236.log /root/SuperTinyLanguageModels/outputs/2024-09-23/09-32-28/wandb/run-20240923_093237-tkkvzfon/logs/debug-core.log: file exists"}
|
| 8 |
+
{"time":"2024-09-23T09:32:37.230871019Z","level":"INFO","msg":"connection init completed","streamId":"tkkvzfon","id":"127.0.0.1:60908"}
|
| 9 |
+
{"time":"2024-09-23T09:33:46.746114105Z","level":"INFO","msg":"connection: teardown","id":"127.0.0.1:60908"}
|
| 10 |
+
{"time":"2024-09-23T09:33:46.746363968Z","level":"INFO","msg":"server is shutting down"}
|
| 11 |
+
{"time":"2024-09-23T09:33:46.746627582Z","level":"INFO","msg":"closed connection","id":"127.0.0.1:60908"}
|
| 12 |
+
{"time":"2024-09-23T09:33:49.927260015Z","level":"INFO","msg":"connection closed","id":"127.0.0.1:60908"}
|
| 13 |
+
{"time":"2024-09-23T09:33:49.927297555Z","level":"INFO","msg":"server is closed"}
|
2024-09-23/09-32-28/wandb/run-20240923_093237-tkkvzfon/logs/debug-internal.log
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"time":"2024-09-23T09:32:37.2270228Z","level":"INFO","msg":"using version","core version":"0.18.1"}
|
| 2 |
+
{"time":"2024-09-23T09:32:37.227060611Z","level":"INFO","msg":"created symlink","path":"/root/SuperTinyLanguageModels/outputs/2024-09-23/09-32-28/wandb/run-20240923_093237-tkkvzfon/logs/debug-core.log"}
|
| 3 |
+
{"time":"2024-09-23T09:32:37.227169702Z","level":"INFO","msg":"using version","core version":"0.18.1"}
|
| 4 |
+
{"time":"2024-09-23T09:32:37.227182172Z","level":"INFO","msg":"created symlink","path":"/root/SuperTinyLanguageModels/outputs/2024-09-23/09-32-28/wandb/run-20240923_093237-tkkvzfon/logs/debug-core.log"}
|
| 5 |
+
{"time":"2024-09-23T09:32:37.230824708Z","level":"INFO","msg":"created new stream","id":"tkkvzfon"}
|
| 6 |
+
{"time":"2024-09-23T09:32:37.230859859Z","level":"INFO","msg":"stream: started","id":"tkkvzfon"}
|
| 7 |
+
{"time":"2024-09-23T09:32:37.230903499Z","level":"INFO","msg":"sender: started","stream_id":{"value":"tkkvzfon"}}
|
| 8 |
+
{"time":"2024-09-23T09:32:37.23092371Z","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"tkkvzfon"}}
|
| 9 |
+
{"time":"2024-09-23T09:32:37.23097304Z","level":"INFO","msg":"handler: started","stream_id":{"value":"tkkvzfon"}}
|
| 10 |
+
{"time":"2024-09-23T09:32:37.634282756Z","level":"INFO","msg":"wandb-core","!BADKEY":null}
|
| 11 |
+
{"time":"2024-09-23T09:32:37.636527894Z","level":"INFO","msg":"Starting system monitor"}
|
| 12 |
+
{"time":"2024-09-23T09:33:46.746283667Z","level":"INFO","msg":"stream: closing","id":"tkkvzfon"}
|
| 13 |
+
{"time":"2024-09-23T09:33:46.746349498Z","level":"INFO","msg":"Stopping system monitor"}
|
| 14 |
+
{"time":"2024-09-23T09:33:46.747359311Z","level":"INFO","msg":"Stopped system monitor"}
|
| 15 |
+
{"time":"2024-09-23T09:33:49.926631346Z","level":"INFO","msg":"handler: closed","stream_id":{"value":"tkkvzfon"}}
|
| 16 |
+
{"time":"2024-09-23T09:33:49.926725448Z","level":"INFO","msg":"writer: Close: closed","stream_id":{"value":"tkkvzfon"}}
|
| 17 |
+
{"time":"2024-09-23T09:33:49.926795918Z","level":"INFO","msg":"sender: closed","stream_id":{"value":"tkkvzfon"}}
|
| 18 |
+
{"time":"2024-09-23T09:33:49.927056922Z","level":"INFO","msg":"stream: closed","id":"tkkvzfon"}
|
2024-09-23/09-32-28/wandb/run-20240923_093237-tkkvzfon/logs/debug.log
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
2024-09-23 09:32:37,221 INFO MainThread:81916 [wandb_setup.py:_flush():77] Current SDK version is 0.18.1
|
| 2 |
+
2024-09-23 09:32:37,221 INFO MainThread:81916 [wandb_setup.py:_flush():77] Configure stats pid to 81916
|
| 3 |
+
2024-09-23 09:32:37,221 INFO MainThread:81916 [wandb_setup.py:_flush():77] Loading settings from /root/.config/wandb/settings
|
| 4 |
+
2024-09-23 09:32:37,221 INFO MainThread:81916 [wandb_setup.py:_flush():77] Loading settings from /root/SuperTinyLanguageModels/outputs/2024-09-23/09-32-28/wandb/settings
|
| 5 |
+
2024-09-23 09:32:37,221 INFO MainThread:81916 [wandb_setup.py:_flush():77] Loading settings from environment variables: {}
|
| 6 |
+
2024-09-23 09:32:37,221 INFO MainThread:81916 [wandb_setup.py:_flush():77] Applying setup settings: {'mode': None, '_disable_service': None}
|
| 7 |
+
2024-09-23 09:32:37,221 INFO MainThread:81916 [wandb_setup.py:_flush():77] Inferring run settings from compute environment: {'program_relpath': 'train.py', 'program_abspath': '/root/SuperTinyLanguageModels/train.py', 'program': '/root/SuperTinyLanguageModels/train.py'}
|
| 8 |
+
2024-09-23 09:32:37,221 INFO MainThread:81916 [wandb_setup.py:_flush():77] Applying login settings: {}
|
| 9 |
+
2024-09-23 09:32:37,221 INFO MainThread:81916 [wandb_init.py:_log_setup():532] Logging user logs to /root/SuperTinyLanguageModels/outputs/2024-09-23/09-32-28/wandb/run-20240923_093237-tkkvzfon/logs/debug.log
|
| 10 |
+
2024-09-23 09:32:37,221 INFO MainThread:81916 [wandb_init.py:_log_setup():533] Logging internal logs to /root/SuperTinyLanguageModels/outputs/2024-09-23/09-32-28/wandb/run-20240923_093237-tkkvzfon/logs/debug-internal.log
|
| 11 |
+
2024-09-23 09:32:37,222 INFO MainThread:81916 [wandb_init.py:init():616] calling init triggers
|
| 12 |
+
2024-09-23 09:32:37,222 INFO MainThread:81916 [wandb_init.py:init():623] wandb.init called with sweep_config: {}
|
| 13 |
+
config: {'model': {'core_model_type': 'pass_through', 'hidden_dim': 384, 'byte_hidden': 128, 'max_chunk_length': 12, 'max_num_chunks': 1024, 'num_delimiter_layers': 3, 'num_byte_decoder_layers': 5, 'target_chunk_len': 8.0, 'chunk_len_loss_weight': 0.1, 'chunk_len_penalty': 0.1, 'context_window': 8192, 'embedding_model_type': 'byte_level', 'tokenizer_type': 'bpe', 'tokenizer_dataset_name': 'simple_en_wiki', 'tokenizer_simplify_data': True, 'vocab_size': 259, 'lm_head_type': 'byte_level', 'lm_head_normalization': 'rms_norm', 'lm_head_bias': False, 'lm_head_dropout': 0.0, 'model_shell_type': 'byte_autoencoder_shell', 'embedding_weight_tying': True, 'ffn_weight_tying': False, 'cproj_weight_tying': False, 'positional_encoding_type': 'rope'}, 'trainer': {'trainer_type': 'base_trainer', 'dataset': 'fineweb_edu_10B', 'batch_size': 6, 'gradient_accumulation_steps': 8, 'max_iters': 10000, 'eval_interval': 50000000, 'log_interval': 1, 'checkpoint_interval': 1000, 'eval_iters': 1000, 'run_eval': False, 'eval': {'mcq_benchmarks': None, 'mcq_num_samples': 1000, 'eval_byte_metrics': False, 'text_modeling_eval': False, 'text_generation_eval': False}, 'optimizer': {'optimizer_name': 'adamW', 'lr': 0.0005, 'min_lr': 5e-05, 'weight_decay': 0.01, 'beta1': 0.9, 'beta2': 0.95, 'grad_clip': 1.0}, 'lr_scheduler': {'name': 'cosine', 'warmup_iters': 100}, 'dataloader': {'name': 'autoencoder'}, 'datasampling': {'name': 'standard'}, 'loss_fn': {'name': 'pass_through'}}, 'general': {'logging': {'wandb_log': True, 'wandb_project': 'SuperTinyLanguageModels', 'wandb_run_name': None, 'group_name': 'experimental_byte_level'}, 'paths': {'output_dir': 'outputs', 'data_dir': '/root/SuperTinyLanguageModels/data', 'checkpoint_dir': 'checkpoints', 'eval_dir': '/root/SuperTinyLanguageModels/evals'}, 'seed': 489, 'device': 'cuda'}}
|
| 14 |
+
2024-09-23 09:32:37,222 INFO MainThread:81916 [wandb_init.py:init():666] starting backend
|
| 15 |
+
2024-09-23 09:32:37,222 INFO MainThread:81916 [wandb_init.py:init():670] setting up manager
|
| 16 |
+
2024-09-23 09:32:37,223 INFO MainThread:81916 [backend.py:_multiprocessing_setup():105] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
|
| 17 |
+
2024-09-23 09:32:37,224 INFO MainThread:81916 [wandb_init.py:init():678] backend started and connected
|
| 18 |
+
2024-09-23 09:32:37,227 INFO MainThread:81916 [wandb_init.py:init():773] updated telemetry
|
| 19 |
+
2024-09-23 09:32:37,236 INFO MainThread:81916 [wandb_init.py:init():806] communicating run to backend with 90.0 second timeout
|
| 20 |
+
2024-09-23 09:32:37,631 INFO MainThread:81916 [wandb_init.py:init():857] starting run threads in backend
|
| 21 |
+
2024-09-23 09:32:37,802 INFO MainThread:81916 [wandb_run.py:_console_start():2459] atexit reg
|
| 22 |
+
2024-09-23 09:32:37,802 INFO MainThread:81916 [wandb_run.py:_redirect():2307] redirect: wrap_raw
|
| 23 |
+
2024-09-23 09:32:37,803 INFO MainThread:81916 [wandb_run.py:_redirect():2372] Wrapping output streams.
|
| 24 |
+
2024-09-23 09:32:37,803 INFO MainThread:81916 [wandb_run.py:_redirect():2397] Redirects installed.
|
| 25 |
+
2024-09-23 09:32:37,806 INFO MainThread:81916 [wandb_init.py:init():900] run started, returning control to user process
|
| 26 |
+
2024-09-23 09:33:46,746 WARNING MsgRouterThr:81916 [router.py:message_loop():77] message_loop has been closed
|
2024-09-23/09-32-28/wandb/run-20240923_093237-tkkvzfon/run-tkkvzfon.wandb
ADDED
|
Binary file (124 kB). View file
|
|
|
2024-09-23/09-33-58/.hydra/config.yaml
ADDED
|
@@ -0,0 +1,74 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
experimental:
|
| 2 |
+
model:
|
| 3 |
+
core_model_type: pass_through
|
| 4 |
+
hidden_dim: 384
|
| 5 |
+
byte_hidden: 128
|
| 6 |
+
max_chunk_length: 12
|
| 7 |
+
max_num_chunks: 1024
|
| 8 |
+
num_delimiter_layers: 3
|
| 9 |
+
num_byte_decoder_layers: 5
|
| 10 |
+
target_chunk_len: 8.0
|
| 11 |
+
chunk_len_loss_weight: 0.1
|
| 12 |
+
chunk_len_penalty: 0.1
|
| 13 |
+
context_window: 8192
|
| 14 |
+
embedding_model_type: byte_level
|
| 15 |
+
tokenizer_type: bpe
|
| 16 |
+
tokenizer_dataset_name: simple_en_wiki
|
| 17 |
+
tokenizer_simplify_data: true
|
| 18 |
+
vocab_size: 259
|
| 19 |
+
lm_head_type: byte_level
|
| 20 |
+
lm_head_normalization: rms_norm
|
| 21 |
+
lm_head_bias: false
|
| 22 |
+
lm_head_dropout: 0.0
|
| 23 |
+
model_shell_type: byte_autoencoder_shell
|
| 24 |
+
embedding_weight_tying: true
|
| 25 |
+
ffn_weight_tying: false
|
| 26 |
+
cproj_weight_tying: false
|
| 27 |
+
positional_encoding_type: rope
|
| 28 |
+
trainer:
|
| 29 |
+
trainer_type: base_trainer
|
| 30 |
+
dataset: fineweb_edu_10B
|
| 31 |
+
batch_size: 6
|
| 32 |
+
gradient_accumulation_steps: 8
|
| 33 |
+
max_iters: 10000
|
| 34 |
+
eval_interval: 50000000
|
| 35 |
+
log_interval: 1
|
| 36 |
+
checkpoint_interval: 1000
|
| 37 |
+
eval_iters: 1000
|
| 38 |
+
run_eval: false
|
| 39 |
+
eval:
|
| 40 |
+
mcq_benchmarks: null
|
| 41 |
+
mcq_num_samples: 1000
|
| 42 |
+
eval_byte_metrics: false
|
| 43 |
+
text_modeling_eval: false
|
| 44 |
+
text_generation_eval: false
|
| 45 |
+
optimizer:
|
| 46 |
+
optimizer_name: adamW
|
| 47 |
+
lr: 0.0005
|
| 48 |
+
min_lr: 5.0e-05
|
| 49 |
+
weight_decay: 0.01
|
| 50 |
+
beta1: 0.9
|
| 51 |
+
beta2: 0.95
|
| 52 |
+
grad_clip: 1.0
|
| 53 |
+
lr_scheduler:
|
| 54 |
+
name: cosine
|
| 55 |
+
warmup_iters: 100
|
| 56 |
+
dataloader:
|
| 57 |
+
name: autoencoder
|
| 58 |
+
datasampling:
|
| 59 |
+
name: standard
|
| 60 |
+
loss_fn:
|
| 61 |
+
name: pass_through
|
| 62 |
+
general:
|
| 63 |
+
logging:
|
| 64 |
+
wandb_log: true
|
| 65 |
+
wandb_project: SuperTinyLanguageModels
|
| 66 |
+
wandb_run_name: null
|
| 67 |
+
group_name: experimental_byte_level
|
| 68 |
+
paths:
|
| 69 |
+
output_dir: outputs
|
| 70 |
+
data_dir: data
|
| 71 |
+
checkpoint_dir: checkpoints
|
| 72 |
+
eval_dir: evals
|
| 73 |
+
seed: 489
|
| 74 |
+
device: cuda
|
2024-09-23/09-33-58/.hydra/hydra.yaml
ADDED
|
@@ -0,0 +1,154 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
hydra:
|
| 2 |
+
run:
|
| 3 |
+
dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S}
|
| 4 |
+
sweep:
|
| 5 |
+
dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S}
|
| 6 |
+
subdir: ${hydra.job.num}
|
| 7 |
+
launcher:
|
| 8 |
+
_target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher
|
| 9 |
+
sweeper:
|
| 10 |
+
_target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper
|
| 11 |
+
max_batch_size: null
|
| 12 |
+
params: null
|
| 13 |
+
help:
|
| 14 |
+
app_name: ${hydra.job.name}
|
| 15 |
+
header: '${hydra.help.app_name} is powered by Hydra.
|
| 16 |
+
|
| 17 |
+
'
|
| 18 |
+
footer: 'Powered by Hydra (https://hydra.cc)
|
| 19 |
+
|
| 20 |
+
Use --hydra-help to view Hydra specific help
|
| 21 |
+
|
| 22 |
+
'
|
| 23 |
+
template: '${hydra.help.header}
|
| 24 |
+
|
| 25 |
+
== Configuration groups ==
|
| 26 |
+
|
| 27 |
+
Compose your configuration from those groups (group=option)
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
$APP_CONFIG_GROUPS
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
== Config ==
|
| 34 |
+
|
| 35 |
+
Override anything in the config (foo.bar=value)
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
$CONFIG
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
${hydra.help.footer}
|
| 42 |
+
|
| 43 |
+
'
|
| 44 |
+
hydra_help:
|
| 45 |
+
template: 'Hydra (${hydra.runtime.version})
|
| 46 |
+
|
| 47 |
+
See https://hydra.cc for more info.
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
== Flags ==
|
| 51 |
+
|
| 52 |
+
$FLAGS_HELP
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
== Configuration groups ==
|
| 56 |
+
|
| 57 |
+
Compose your configuration from those groups (For example, append hydra/job_logging=disabled
|
| 58 |
+
to command line)
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
$HYDRA_CONFIG_GROUPS
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
Use ''--cfg hydra'' to Show the Hydra config.
|
| 65 |
+
|
| 66 |
+
'
|
| 67 |
+
hydra_help: ???
|
| 68 |
+
hydra_logging:
|
| 69 |
+
version: 1
|
| 70 |
+
formatters:
|
| 71 |
+
simple:
|
| 72 |
+
format: '[%(asctime)s][HYDRA] %(message)s'
|
| 73 |
+
handlers:
|
| 74 |
+
console:
|
| 75 |
+
class: logging.StreamHandler
|
| 76 |
+
formatter: simple
|
| 77 |
+
stream: ext://sys.stdout
|
| 78 |
+
root:
|
| 79 |
+
level: INFO
|
| 80 |
+
handlers:
|
| 81 |
+
- console
|
| 82 |
+
loggers:
|
| 83 |
+
logging_example:
|
| 84 |
+
level: DEBUG
|
| 85 |
+
disable_existing_loggers: false
|
| 86 |
+
job_logging:
|
| 87 |
+
version: 1
|
| 88 |
+
formatters:
|
| 89 |
+
simple:
|
| 90 |
+
format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s'
|
| 91 |
+
handlers:
|
| 92 |
+
console:
|
| 93 |
+
class: logging.StreamHandler
|
| 94 |
+
formatter: simple
|
| 95 |
+
stream: ext://sys.stdout
|
| 96 |
+
file:
|
| 97 |
+
class: logging.FileHandler
|
| 98 |
+
formatter: simple
|
| 99 |
+
filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log
|
| 100 |
+
root:
|
| 101 |
+
level: INFO
|
| 102 |
+
handlers:
|
| 103 |
+
- console
|
| 104 |
+
- file
|
| 105 |
+
disable_existing_loggers: false
|
| 106 |
+
env: {}
|
| 107 |
+
mode: RUN
|
| 108 |
+
searchpath: []
|
| 109 |
+
callbacks: {}
|
| 110 |
+
output_subdir: .hydra
|
| 111 |
+
overrides:
|
| 112 |
+
hydra:
|
| 113 |
+
- hydra.mode=RUN
|
| 114 |
+
task: []
|
| 115 |
+
job:
|
| 116 |
+
name: train
|
| 117 |
+
chdir: null
|
| 118 |
+
override_dirname: ''
|
| 119 |
+
id: ???
|
| 120 |
+
num: ???
|
| 121 |
+
config_name: experimental/byte_autoencoder_1
|
| 122 |
+
env_set: {}
|
| 123 |
+
env_copy: []
|
| 124 |
+
config:
|
| 125 |
+
override_dirname:
|
| 126 |
+
kv_sep: '='
|
| 127 |
+
item_sep: ','
|
| 128 |
+
exclude_keys: []
|
| 129 |
+
runtime:
|
| 130 |
+
version: 1.3.2
|
| 131 |
+
version_base: '1.1'
|
| 132 |
+
cwd: /root/SuperTinyLanguageModels
|
| 133 |
+
config_sources:
|
| 134 |
+
- path: hydra.conf
|
| 135 |
+
schema: pkg
|
| 136 |
+
provider: hydra
|
| 137 |
+
- path: /root/SuperTinyLanguageModels/configs/train
|
| 138 |
+
schema: file
|
| 139 |
+
provider: main
|
| 140 |
+
- path: ''
|
| 141 |
+
schema: structured
|
| 142 |
+
provider: schema
|
| 143 |
+
output_dir: /root/SuperTinyLanguageModels/outputs/2024-09-23/09-33-58
|
| 144 |
+
choices:
|
| 145 |
+
hydra/env: default
|
| 146 |
+
hydra/callbacks: null
|
| 147 |
+
hydra/job_logging: default
|
| 148 |
+
hydra/hydra_logging: default
|
| 149 |
+
hydra/hydra_help: default
|
| 150 |
+
hydra/help: default
|
| 151 |
+
hydra/sweeper: basic
|
| 152 |
+
hydra/launcher: basic
|
| 153 |
+
hydra/output: default
|
| 154 |
+
verbose: false
|
2024-09-23/09-33-58/.hydra/overrides.yaml
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
[]
|
2024-09-23/09-33-58/checkpoints/ckpt_1000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d9e847e5371dfd2f5ac68ee97e737d4ab63d42fdde1c885d6ab4915a9b3ccf83
|
| 3 |
+
size 69377274
|
2024-09-23/09-33-58/checkpoints/ckpt_2000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:261a3f997548dd7b6a92a1a7a51b37b1d559a7b64547c95b98a336bdc2685da0
|
| 3 |
+
size 69377274
|
2024-09-23/09-33-58/checkpoints/ckpt_3000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:51018e44f695f15948d2cbcd014d62113a7a82a67ca7ca25dc767a77c12ae563
|
| 3 |
+
size 69377274
|
2024-09-23/09-33-58/checkpoints/ckpt_4000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:bf165a859555ddeb74ad0c7b6e10f17fa5f91c1b060a14bd77dd7fedbde5503c
|
| 3 |
+
size 69377274
|
2024-09-23/09-33-58/checkpoints/ckpt_5000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:99bdcae468dc981532ae56ecd8616824b1cf86801d364510be19a57467a81dbb
|
| 3 |
+
size 69377274
|