ibitec committed on
Commit
08e4228
·
verified ·
1 Parent(s): d6ed112

Upload folder using huggingface_hub

Browse files
best_hydra_mark-v1.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:43949d03c1052bd9e9a3be70f31db69c4d22040b95bf2c8caf919198755cb964
3
+ size 367591404
best_hydra_mark-v2.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c17b685f26c1e2aa2f71da30f0de86595872a570ffcdb2202b180349d5d6ff0f
3
+ size 238130643
best_hydra_mark.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:69dcf929e47fb76a0e94bc901fcb5e0053b91470589923f47278dc397493229f
3
+ size 261410003
lightning_logs/version_0/events.out.tfevents.1775782985.b85934d9bf78.17981.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6d0a3291cdf35419efc6c4a90b33b8b444f7b286375678372b7e614a901a8c47
3
+ size 19848
lightning_logs/version_0/hparams.yaml ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ batch_size: 112
2
+ epochs: 3
3
+ stage: 1
4
+ unfrozen_ratio: 0.3
5
+ ckpt_weights_only: false
6
+ checkpoint_dir: ./checkpoints/hydra_mark
7
+ train_data_dir: ./data/train_shards1
8
+ val_data_dir: ./data/val_shards
9
+ weights_path: ./models/hydra_hypernet_mark.pt
10
+ shuffle: true
11
+ use_early_stopping: false
12
+ max_patience_counter: 5
13
+ min_delta: 0.01
14
+ use_gradient_clipping: true
15
+ gradient_clipping_norm: 1.0
16
+ pad_length: 4096
17
+ learning_rate_mark: 0.0006
18
+ learning_rate_hydra: 3.0e-05
19
+ learning_rate_cls: 0.0001
20
+ no_cache: false
21
+ num_workers: 8
22
+ matmul_precision: high
23
+ multi_shot: false
24
+ intervals: 3
25
+ is_prenorm: false
26
+ accumulate_grad_batches: 2
27
+ cart: true
28
+ cart_p: 0.45
29
+ cart_scale: 1.0
30
+ distillation: false
31
+ lr_scheduler:
32
+ type: cosine
33
+ warmup_steps: 720
34
+ total_steps: 14400
35
+ min_lr_ratio: 0.1
36
+ polynomial:
37
+ end_lr_ratio: 0.0
38
+ power: 1.0
39
+ plateau:
40
+ factor: 0.5
41
+ patience: 3
42
+ min_lr: 1.0e-06
43
+ trainer:
44
+ accelerator: gpu
45
+ devices: -1
46
+ check_val_every_n_epoch: null
47
+ num_sanity_val_steps: 0
48
+ accumulate_grad_batches: 1
49
+ precision: bf16-true
50
+ enable_checkpointing: true
51
+ default_root_dir: ./checkpoints/hydra_mark
52
+ wandb:
53
+ project: hydra-training_hypernet
54
+ model_name: HydraForMaskedLM
55
+ watch_log: all
56
+ log_freq: 20
57
+ hydra_config:
58
+ hidden_size: 768
59
+ vocab_size: 30522
60
+ type_vocab_size: 2
61
+ pad_token_id: 0
62
+ use_position_embeddings: false
63
+ max_position_embeddings: 4096
64
+ use_timestep_embeddings: true
65
+ layer_norm_eps: 1.0e-12
66
+ dropout: 0.0
67
+ max_timestep_embeddings: 1000
68
+ current_timestep: 0
69
+ d_state: 64
70
+ d_conv: 7
71
+ head_dim: 64
72
+ expand: 2
73
+ chunk_size: 256
74
+ is_prenorm: false
75
+ use_eff_compute: false
76
+ gradient_checkpointing: true
77
+ num_hidden_layers: 23
78
+ guider_hidden_layers: 12
79
+ device: cpu
80
+ pool_all: false
81
+ mark_kernel: hypernet
82
+ mark_ensemble: false
83
+ rank: 2
84
+ degree: 5
85
+ L_timepoints: 256
86
+ n_freqs: 8
87
+ mark_mlp_dim: 256
88
+ hidden_act: swish
89
+ initializer_range: 0.02
lightning_logs/version_1/events.out.tfevents.1776275720.b85934d9bf78.42893.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8075de978b2476667852f5875cc2ad8d55f6be6d3806729ac9e442a7db4f039a
3
+ size 20236
lightning_logs/version_1/hparams.yaml ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ batch_size: 112
2
+ epochs: 5
3
+ stage: 2
4
+ unfrozen_ratio: 0.3
5
+ ckpt_weights_only: true
6
+ checkpoint_dir: ./checkpoints/hydra_mark
7
+ train_data_dir: ./data/train_shards2
8
+ val_data_dir: ./data/val_shards
9
+ weights_path: ./models/hydra_hypernet_mark.pt
10
+ shuffle: true
11
+ use_early_stopping: false
12
+ max_patience_counter: 5
13
+ min_delta: 0.01
14
+ use_gradient_clipping: true
15
+ gradient_clipping_norm: 1.0
16
+ pad_length: 4096
17
+ learning_rate_mark: 0.0006
18
+ learning_rate_hydra: 3.0e-05
19
+ learning_rate_cls: 0.0001
20
+ no_cache: false
21
+ num_workers: 8
22
+ matmul_precision: high
23
+ multi_shot: false
24
+ intervals: 3
25
+ is_prenorm: false
26
+ accumulate_grad_batches: 2
27
+ cart: true
28
+ cart_p: 0.45
29
+ cart_scale: 1.0
30
+ distillation: false
31
+ lr_scheduler:
32
+ type: cosine
33
+ warmup_steps: 720
34
+ total_steps: 14400
35
+ min_lr_ratio: 0.1
36
+ polynomial:
37
+ end_lr_ratio: 0.0
38
+ power: 1.0
39
+ plateau:
40
+ factor: 0.5
41
+ patience: 3
42
+ min_lr: 1.0e-06
43
+ trainer:
44
+ accelerator: gpu
45
+ devices: -1
46
+ check_val_every_n_epoch: null
47
+ num_sanity_val_steps: 0
48
+ accumulate_grad_batches: 1
49
+ precision: bf16-true
50
+ enable_checkpointing: true
51
+ default_root_dir: ./checkpoints/hydra_mark
52
+ wandb:
53
+ project: hydra-training_hypernet
54
+ model_name: HydraForMaskedLM
55
+ watch_log: all
56
+ log_freq: 20
57
+ hydra_config:
58
+ hidden_size: 768
59
+ vocab_size: 30522
60
+ type_vocab_size: 2
61
+ pad_token_id: 0
62
+ use_position_embeddings: false
63
+ max_position_embeddings: 4096
64
+ use_timestep_embeddings: true
65
+ layer_norm_eps: 1.0e-12
66
+ dropout: 0.0
67
+ max_timestep_embeddings: 1000
68
+ current_timestep: 0
69
+ d_state: 64
70
+ d_conv: 7
71
+ head_dim: 64
72
+ expand: 2
73
+ chunk_size: 256
74
+ is_prenorm: false
75
+ use_eff_compute: false
76
+ gradient_checkpointing: true
77
+ num_hidden_layers: 23
78
+ guider_hidden_layers: 12
79
+ device: cpu
80
+ pool_all: false
81
+ mark_kernel: hypernet
82
+ mark_ensemble: false
83
+ rank: 2
84
+ degree: 5
85
+ L_timepoints: 256
86
+ n_freqs: 8
87
+ mark_mlp_dim: 256
88
+ hidden_act: swish
89
+ initializer_range: 0.02
lightning_logs/version_2/events.out.tfevents.1776329596.b85934d9bf78.46840.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b0c7420a7b2e8a3090ded89e5d94f25a009c120e2d0f8a28877911c34cf0fc00
3
+ size 7206
lightning_logs/version_2/hparams.yaml ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ batch_size: 112
2
+ epochs: 3
3
+ stage: 3
4
+ unfrozen_ratio: 0.3
5
+ ckpt_weights_only: true
6
+ checkpoint_dir: ./checkpoints/hydra_mark
7
+ train_data_dir: ./data/train_shards3
8
+ val_data_dir: ./data/val_shards
9
+ weights_path: ./models/hydra_hypernet_mark.pt
10
+ shuffle: true
11
+ use_early_stopping: false
12
+ max_patience_counter: 5
13
+ min_delta: 0.01
14
+ use_gradient_clipping: true
15
+ gradient_clipping_norm: 1.0
16
+ pad_length: 4096
17
+ learning_rate_mark: 0.0006
18
+ learning_rate_hydra: 3.0e-05
19
+ learning_rate_cls: 0.0001
20
+ no_cache: false
21
+ num_workers: 8
22
+ matmul_precision: high
23
+ multi_shot: false
24
+ intervals: 3
25
+ is_prenorm: false
26
+ accumulate_grad_batches: 2
27
+ cart: true
28
+ cart_p: 0.45
29
+ cart_scale: 1.0
30
+ distillation: false
31
+ lr_scheduler:
32
+ type: cosine
33
+ warmup_steps: 720
34
+ total_steps: 14400
35
+ min_lr_ratio: 0.1
36
+ polynomial:
37
+ end_lr_ratio: 0.0
38
+ power: 1.0
39
+ plateau:
40
+ factor: 0.5
41
+ patience: 3
42
+ min_lr: 1.0e-06
43
+ trainer:
44
+ accelerator: gpu
45
+ devices: -1
46
+ check_val_every_n_epoch: null
47
+ num_sanity_val_steps: 0
48
+ accumulate_grad_batches: 1
49
+ precision: bf16-true
50
+ enable_checkpointing: true
51
+ default_root_dir: ./checkpoints/hydra_mark
52
+ wandb:
53
+ project: hydra-training_hypernet
54
+ model_name: HydraForMaskedLM
55
+ watch_log: all
56
+ log_freq: 20
57
+ hydra_config:
58
+ hidden_size: 768
59
+ vocab_size: 30522
60
+ type_vocab_size: 2
61
+ pad_token_id: 0
62
+ use_position_embeddings: false
63
+ max_position_embeddings: 4096
64
+ use_timestep_embeddings: true
65
+ layer_norm_eps: 1.0e-12
66
+ dropout: 0.0
67
+ max_timestep_embeddings: 1000
68
+ current_timestep: 0
69
+ d_state: 64
70
+ d_conv: 7
71
+ head_dim: 64
72
+ expand: 2
73
+ chunk_size: 256
74
+ is_prenorm: false
75
+ use_eff_compute: false
76
+ gradient_checkpointing: true
77
+ num_hidden_layers: 23
78
+ guider_hidden_layers: 12
79
+ device: cpu
80
+ pool_all: false
81
+ mark_kernel: hypernet
82
+ mark_ensemble: false
83
+ rank: 2
84
+ degree: 5
85
+ L_timepoints: 256
86
+ n_freqs: 8
87
+ mark_mlp_dim: 256
88
+ hidden_act: swish
89
+ initializer_range: 0.02