codebyzeb committed on
Commit
2eaa5a7
·
verified ·
1 Parent(s): c9f3303

Upload folder using huggingface_hub

Browse files
llm/fw57M-multi-tied/.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
llm/fw57M-multi-tied/README.md ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ {}
3
+ ---
4
+ ## Experiment Configuration
5
+ ```yaml
6
+ callbacks:
7
+ grad_accum:
8
+ _target_: src.callbacks.gradient_accumulation.GradientAccumulationScheduler
9
+ scheduling:
10
+ 0: 2
11
+ grad_norm:
12
+ _target_: src.callbacks.grad_norm.GradNorm
13
+ check_clipping: false
14
+ group_separator: /
15
+ histogram_freq: null
16
+ log_weight_distribution: false
17
+ norm_type: 2
18
+ only_total: true
19
+ lr_monitor:
20
+ _target_: src.callbacks.lr_monitor.SimpleLearningRateMonitor
21
+ model_checkpoint:
22
+ _target_: src.callbacks.model_checkpoint.ModelCheckpoint
23
+ dirpath: .checkpoints
24
+ enable_version_counter: false
25
+ every_n_train_steps: 1000
26
+ filename: '{step}'
27
+ save_initial_checkpoint: true
28
+ save_last: link
29
+ save_top_k: -1
30
+ verbose: true
31
+ speed_monitor:
32
+ _target_: src.callbacks.speed_monitor.SpeedMonitor
33
+ data:
34
+ batch_size: 64
35
+ drop_last: false
36
+ eval_batch_size: 64
37
+ multiprocessing_context: null
38
+ num_workers: 12
39
+ persistent_workers: false
40
+ pin_memory: true
41
+ prefetch_factor: 2
42
+ shuffle: true
43
+ dataset: common-corpus
44
+ evaluation:
45
+ blimp: false
46
+ loggers:
47
+ tensorboard:
48
+ _target_: src.trainer.TensorBoardLogger
49
+ name: ''
50
+ save_dir: ./
51
+ version: null
52
+ model: fw57M-tied
53
+ optim:
54
+ lr: 0.0006
55
+ num_warmup_steps: 2000
56
+ optim_kwargs:
57
+ betas:
58
+ - 0.9
59
+ - 0.95
60
+ eps: 1.0e-08
61
+ fused: true
62
+ optim_name: adamw
63
+ scheduler_kwargs:
64
+ min_lr_ratio: 0.01
65
+ num_decay_steps: 4000
66
+ num_stable_steps: 44000
67
+ scheduler_name: warmup_stable_decay
68
+ weight_decay: 0.01
69
+ out_parent_folder: model_train
70
+ pwd: /home/zg258/rds/hpc-work/infotokenization
71
+ resume_from_checkpoint: .checkpoints/last.ckpt
72
+ run_folder: .
73
+ save_initial_checkpoint: true
74
+ seed: 42
75
+ tok_name: bytelevel
76
+ torch_compile: true
77
+ train_data_path: /home/zg258/rds/hpc-work/infotokenization/data/common-corpus/bytelevel/train
78
+ trainer:
79
+ accelerator: gpu
80
+ deterministic: false
81
+ devices: 1
82
+ enable_progress_bar: true
83
+ fast_dev_run: false
84
+ gradient_clip_algorithm: norm
85
+ gradient_clip_val: 1.0
86
+ limit_val_batches: 500
87
+ log_every_n_steps: 1
88
+ max_steps: 50000
89
+ precision: bf16-true
90
+ val_check_interval: 1000
91
+ val_data_path: /home/zg258/rds/hpc-work/infotokenization/data/common-corpus/bytelevel/validation
92
+ ```
llm/fw57M-multi-tied/hparams.yaml ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ loggers:
2
+ tensorboard:
3
+ _target_: src.trainer.TensorBoardLogger
4
+ save_dir: ./
5
+ name: ''
6
+ version: null
7
+ callbacks:
8
+ lr_monitor:
9
+ _target_: src.callbacks.lr_monitor.SimpleLearningRateMonitor
10
+ grad_norm:
11
+ _target_: src.callbacks.grad_norm.GradNorm
12
+ norm_type: 2
13
+ group_separator: /
14
+ histogram_freq: null
15
+ check_clipping: false
16
+ log_weight_distribution: false
17
+ only_total: true
18
+ speed_monitor:
19
+ _target_: src.callbacks.speed_monitor.SpeedMonitor
20
+ grad_accum:
21
+ _target_: src.callbacks.gradient_accumulation.GradientAccumulationScheduler
22
+ scheduling:
23
+ 0: 2
24
+ model_checkpoint:
25
+ _target_: src.callbacks.model_checkpoint.ModelCheckpoint
26
+ dirpath: .checkpoints
27
+ filename: '{step}'
28
+ enable_version_counter: false
29
+ every_n_train_steps: 1000
30
+ save_top_k: -1
31
+ save_last: link
32
+ verbose: true
33
+ save_initial_checkpoint: true
34
+ out_parent_folder: model_train
35
+ tok_name: bytelevel
36
+ run_folder: .
37
+ dataset: common-corpus
38
+ pwd: /home/zg258/rds/hpc-work/infotokenization
39
+ train_data_path: /home/zg258/rds/hpc-work/infotokenization/data/common-corpus/bytelevel/train
40
+ val_data_path: /home/zg258/rds/hpc-work/infotokenization/data/common-corpus/bytelevel/validation
41
+ model: fw57M-tied
42
+ resume_from_checkpoint: .checkpoints/last.ckpt
43
+ save_initial_checkpoint: true
44
+ seed: 42
45
+ torch_compile: true
46
+ data:
47
+ batch_size: 64
48
+ eval_batch_size: 64
49
+ shuffle: true
50
+ drop_last: false
51
+ num_workers: 12
52
+ pin_memory: true
53
+ persistent_workers: false
54
+ prefetch_factor: 2
55
+ multiprocessing_context: null
56
+ optim:
57
+ optim_name: adamw
58
+ lr: 0.0006
59
+ weight_decay: 0.01
60
+ optim_kwargs:
61
+ fused: true
62
+ eps: 1.0e-08
63
+ betas:
64
+ - 0.9
65
+ - 0.95
66
+ scheduler_name: warmup_stable_decay
67
+ num_warmup_steps: 2000
68
+ scheduler_kwargs:
69
+ num_stable_steps: 44000
70
+ num_decay_steps: 4000
71
+ min_lr_ratio: 0.01
72
+ trainer:
73
+ accelerator: gpu
74
+ devices: 1
75
+ precision: bf16-true
76
+ deterministic: false
77
+ log_every_n_steps: 1
78
+ enable_progress_bar: true
79
+ fast_dev_run: false
80
+ gradient_clip_val: 1.0
81
+ gradient_clip_algorithm: norm
82
+ val_check_interval: 1000
83
+ max_steps: 50000
84
+ limit_val_batches: 500
85
+ evaluation:
86
+ blimp: false
llm/fw57M-multi-tied/special_tokens_map.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "eos_token": {
3
+ "content": "<|endoftext|>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "pad_token": {
10
+ "content": "<|padding|>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ }
16
+ }