Upload folder using huggingface_hub
Browse files- best_hydra_mark-v1.ckpt +3 -0
- best_hydra_mark-v2.ckpt +3 -0
- best_hydra_mark.ckpt +3 -0
- lightning_logs/version_0/events.out.tfevents.1775782985.b85934d9bf78.17981.0 +3 -0
- lightning_logs/version_0/hparams.yaml +89 -0
- lightning_logs/version_1/events.out.tfevents.1776275720.b85934d9bf78.42893.0 +3 -0
- lightning_logs/version_1/hparams.yaml +89 -0
- lightning_logs/version_2/events.out.tfevents.1776329596.b85934d9bf78.46840.0 +3 -0
- lightning_logs/version_2/hparams.yaml +89 -0
best_hydra_mark-v1.ckpt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:43949d03c1052bd9e9a3be70f31db69c4d22040b95bf2c8caf919198755cb964
|
| 3 |
+
size 367591404
|
best_hydra_mark-v2.ckpt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c17b685f26c1e2aa2f71da30f0de86595872a570ffcdb2202b180349d5d6ff0f
|
| 3 |
+
size 238130643
|
best_hydra_mark.ckpt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:69dcf929e47fb76a0e94bc901fcb5e0053b91470589923f47278dc397493229f
|
| 3 |
+
size 261410003
|
lightning_logs/version_0/events.out.tfevents.1775782985.b85934d9bf78.17981.0
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6d0a3291cdf35419efc6c4a90b33b8b444f7b286375678372b7e614a901a8c47
|
| 3 |
+
size 19848
|
lightning_logs/version_0/hparams.yaml
ADDED
|
@@ -0,0 +1,89 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
batch_size: 112
|
| 2 |
+
epochs: 3
|
| 3 |
+
stage: 1
|
| 4 |
+
unfrozen_ratio: 0.3
|
| 5 |
+
ckpt_weights_only: false
|
| 6 |
+
checkpoint_dir: ./checkpoints/hydra_mark
|
| 7 |
+
train_data_dir: ./data/train_shards1
|
| 8 |
+
val_data_dir: ./data/val_shards
|
| 9 |
+
weights_path: ./models/hydra_hypernet_mark.pt
|
| 10 |
+
shuffle: true
|
| 11 |
+
use_early_stopping: false
|
| 12 |
+
max_patience_counter: 5
|
| 13 |
+
min_delta: 0.01
|
| 14 |
+
use_gradient_clipping: true
|
| 15 |
+
gradient_clipping_norm: 1.0
|
| 16 |
+
pad_length: 4096
|
| 17 |
+
learning_rate_mark: 0.0006
|
| 18 |
+
learning_rate_hydra: 3.0e-05
|
| 19 |
+
learning_rate_cls: 0.0001
|
| 20 |
+
no_cache: false
|
| 21 |
+
num_workers: 8
|
| 22 |
+
matmul_precision: high
|
| 23 |
+
multi_shot: false
|
| 24 |
+
intervals: 3
|
| 25 |
+
is_prenorm: false
|
| 26 |
+
accumulate_grad_batches: 2
|
| 27 |
+
cart: true
|
| 28 |
+
cart_p: 0.45
|
| 29 |
+
cart_scale: 1.0
|
| 30 |
+
distillation: false
|
| 31 |
+
lr_scheduler:
|
| 32 |
+
type: cosine
|
| 33 |
+
warmup_steps: 720
|
| 34 |
+
total_steps: 14400
|
| 35 |
+
min_lr_ratio: 0.1
|
| 36 |
+
polynomial:
|
| 37 |
+
end_lr_ratio: 0.0
|
| 38 |
+
power: 1.0
|
| 39 |
+
plateau:
|
| 40 |
+
factor: 0.5
|
| 41 |
+
patience: 3
|
| 42 |
+
min_lr: 1.0e-06
|
| 43 |
+
trainer:
|
| 44 |
+
accelerator: gpu
|
| 45 |
+
devices: -1
|
| 46 |
+
check_val_every_n_epoch: null
|
| 47 |
+
num_sanity_val_steps: 0
|
| 48 |
+
accumulate_grad_batches: 1
|
| 49 |
+
precision: bf16-true
|
| 50 |
+
enable_checkpointing: true
|
| 51 |
+
default_root_dir: ./checkpoints/hydra_mark
|
| 52 |
+
wandb:
|
| 53 |
+
project: hydra-training_hypernet
|
| 54 |
+
model_name: HydraForMaskedLM
|
| 55 |
+
watch_log: all
|
| 56 |
+
log_freq: 20
|
| 57 |
+
hydra_config:
|
| 58 |
+
hidden_size: 768
|
| 59 |
+
vocab_size: 30522
|
| 60 |
+
type_vocab_size: 2
|
| 61 |
+
pad_token_id: 0
|
| 62 |
+
use_position_embeddings: false
|
| 63 |
+
max_position_embeddings: 4096
|
| 64 |
+
use_timestep_embeddings: true
|
| 65 |
+
layer_norm_eps: 1.0e-12
|
| 66 |
+
dropout: 0.0
|
| 67 |
+
max_timestep_embeddings: 1000
|
| 68 |
+
current_timestep: 0
|
| 69 |
+
d_state: 64
|
| 70 |
+
d_conv: 7
|
| 71 |
+
head_dim: 64
|
| 72 |
+
expand: 2
|
| 73 |
+
chunk_size: 256
|
| 74 |
+
is_prenorm: false
|
| 75 |
+
use_eff_compute: false
|
| 76 |
+
gradient_checkpointing: true
|
| 77 |
+
num_hidden_layers: 23
|
| 78 |
+
guider_hidden_layers: 12
|
| 79 |
+
device: cpu
|
| 80 |
+
pool_all: false
|
| 81 |
+
mark_kernel: hypernet
|
| 82 |
+
mark_ensemble: false
|
| 83 |
+
rank: 2
|
| 84 |
+
degree: 5
|
| 85 |
+
L_timepoints: 256
|
| 86 |
+
n_freqs: 8
|
| 87 |
+
mark_mlp_dim: 256
|
| 88 |
+
hidden_act: swish
|
| 89 |
+
initializer_range: 0.02
|
lightning_logs/version_1/events.out.tfevents.1776275720.b85934d9bf78.42893.0
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8075de978b2476667852f5875cc2ad8d55f6be6d3806729ac9e442a7db4f039a
|
| 3 |
+
size 20236
|
lightning_logs/version_1/hparams.yaml
ADDED
|
@@ -0,0 +1,89 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
batch_size: 112
|
| 2 |
+
epochs: 5
|
| 3 |
+
stage: 2
|
| 4 |
+
unfrozen_ratio: 0.3
|
| 5 |
+
ckpt_weights_only: true
|
| 6 |
+
checkpoint_dir: ./checkpoints/hydra_mark
|
| 7 |
+
train_data_dir: ./data/train_shards2
|
| 8 |
+
val_data_dir: ./data/val_shards
|
| 9 |
+
weights_path: ./models/hydra_hypernet_mark.pt
|
| 10 |
+
shuffle: true
|
| 11 |
+
use_early_stopping: false
|
| 12 |
+
max_patience_counter: 5
|
| 13 |
+
min_delta: 0.01
|
| 14 |
+
use_gradient_clipping: true
|
| 15 |
+
gradient_clipping_norm: 1.0
|
| 16 |
+
pad_length: 4096
|
| 17 |
+
learning_rate_mark: 0.0006
|
| 18 |
+
learning_rate_hydra: 3.0e-05
|
| 19 |
+
learning_rate_cls: 0.0001
|
| 20 |
+
no_cache: false
|
| 21 |
+
num_workers: 8
|
| 22 |
+
matmul_precision: high
|
| 23 |
+
multi_shot: false
|
| 24 |
+
intervals: 3
|
| 25 |
+
is_prenorm: false
|
| 26 |
+
accumulate_grad_batches: 2
|
| 27 |
+
cart: true
|
| 28 |
+
cart_p: 0.45
|
| 29 |
+
cart_scale: 1.0
|
| 30 |
+
distillation: false
|
| 31 |
+
lr_scheduler:
|
| 32 |
+
type: cosine
|
| 33 |
+
warmup_steps: 720
|
| 34 |
+
total_steps: 14400
|
| 35 |
+
min_lr_ratio: 0.1
|
| 36 |
+
polynomial:
|
| 37 |
+
end_lr_ratio: 0.0
|
| 38 |
+
power: 1.0
|
| 39 |
+
plateau:
|
| 40 |
+
factor: 0.5
|
| 41 |
+
patience: 3
|
| 42 |
+
min_lr: 1.0e-06
|
| 43 |
+
trainer:
|
| 44 |
+
accelerator: gpu
|
| 45 |
+
devices: -1
|
| 46 |
+
check_val_every_n_epoch: null
|
| 47 |
+
num_sanity_val_steps: 0
|
| 48 |
+
accumulate_grad_batches: 1
|
| 49 |
+
precision: bf16-true
|
| 50 |
+
enable_checkpointing: true
|
| 51 |
+
default_root_dir: ./checkpoints/hydra_mark
|
| 52 |
+
wandb:
|
| 53 |
+
project: hydra-training_hypernet
|
| 54 |
+
model_name: HydraForMaskedLM
|
| 55 |
+
watch_log: all
|
| 56 |
+
log_freq: 20
|
| 57 |
+
hydra_config:
|
| 58 |
+
hidden_size: 768
|
| 59 |
+
vocab_size: 30522
|
| 60 |
+
type_vocab_size: 2
|
| 61 |
+
pad_token_id: 0
|
| 62 |
+
use_position_embeddings: false
|
| 63 |
+
max_position_embeddings: 4096
|
| 64 |
+
use_timestep_embeddings: true
|
| 65 |
+
layer_norm_eps: 1.0e-12
|
| 66 |
+
dropout: 0.0
|
| 67 |
+
max_timestep_embeddings: 1000
|
| 68 |
+
current_timestep: 0
|
| 69 |
+
d_state: 64
|
| 70 |
+
d_conv: 7
|
| 71 |
+
head_dim: 64
|
| 72 |
+
expand: 2
|
| 73 |
+
chunk_size: 256
|
| 74 |
+
is_prenorm: false
|
| 75 |
+
use_eff_compute: false
|
| 76 |
+
gradient_checkpointing: true
|
| 77 |
+
num_hidden_layers: 23
|
| 78 |
+
guider_hidden_layers: 12
|
| 79 |
+
device: cpu
|
| 80 |
+
pool_all: false
|
| 81 |
+
mark_kernel: hypernet
|
| 82 |
+
mark_ensemble: false
|
| 83 |
+
rank: 2
|
| 84 |
+
degree: 5
|
| 85 |
+
L_timepoints: 256
|
| 86 |
+
n_freqs: 8
|
| 87 |
+
mark_mlp_dim: 256
|
| 88 |
+
hidden_act: swish
|
| 89 |
+
initializer_range: 0.02
|
lightning_logs/version_2/events.out.tfevents.1776329596.b85934d9bf78.46840.0
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b0c7420a7b2e8a3090ded89e5d94f25a009c120e2d0f8a28877911c34cf0fc00
|
| 3 |
+
size 7206
|
lightning_logs/version_2/hparams.yaml
ADDED
|
@@ -0,0 +1,89 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
batch_size: 112
|
| 2 |
+
epochs: 3
|
| 3 |
+
stage: 3
|
| 4 |
+
unfrozen_ratio: 0.3
|
| 5 |
+
ckpt_weights_only: true
|
| 6 |
+
checkpoint_dir: ./checkpoints/hydra_mark
|
| 7 |
+
train_data_dir: ./data/train_shards3
|
| 8 |
+
val_data_dir: ./data/val_shards
|
| 9 |
+
weights_path: ./models/hydra_hypernet_mark.pt
|
| 10 |
+
shuffle: true
|
| 11 |
+
use_early_stopping: false
|
| 12 |
+
max_patience_counter: 5
|
| 13 |
+
min_delta: 0.01
|
| 14 |
+
use_gradient_clipping: true
|
| 15 |
+
gradient_clipping_norm: 1.0
|
| 16 |
+
pad_length: 4096
|
| 17 |
+
learning_rate_mark: 0.0006
|
| 18 |
+
learning_rate_hydra: 3.0e-05
|
| 19 |
+
learning_rate_cls: 0.0001
|
| 20 |
+
no_cache: false
|
| 21 |
+
num_workers: 8
|
| 22 |
+
matmul_precision: high
|
| 23 |
+
multi_shot: false
|
| 24 |
+
intervals: 3
|
| 25 |
+
is_prenorm: false
|
| 26 |
+
accumulate_grad_batches: 2
|
| 27 |
+
cart: true
|
| 28 |
+
cart_p: 0.45
|
| 29 |
+
cart_scale: 1.0
|
| 30 |
+
distillation: false
|
| 31 |
+
lr_scheduler:
|
| 32 |
+
type: cosine
|
| 33 |
+
warmup_steps: 720
|
| 34 |
+
total_steps: 14400
|
| 35 |
+
min_lr_ratio: 0.1
|
| 36 |
+
polynomial:
|
| 37 |
+
end_lr_ratio: 0.0
|
| 38 |
+
power: 1.0
|
| 39 |
+
plateau:
|
| 40 |
+
factor: 0.5
|
| 41 |
+
patience: 3
|
| 42 |
+
min_lr: 1.0e-06
|
| 43 |
+
trainer:
|
| 44 |
+
accelerator: gpu
|
| 45 |
+
devices: -1
|
| 46 |
+
check_val_every_n_epoch: null
|
| 47 |
+
num_sanity_val_steps: 0
|
| 48 |
+
accumulate_grad_batches: 1
|
| 49 |
+
precision: bf16-true
|
| 50 |
+
enable_checkpointing: true
|
| 51 |
+
default_root_dir: ./checkpoints/hydra_mark
|
| 52 |
+
wandb:
|
| 53 |
+
project: hydra-training_hypernet
|
| 54 |
+
model_name: HydraForMaskedLM
|
| 55 |
+
watch_log: all
|
| 56 |
+
log_freq: 20
|
| 57 |
+
hydra_config:
|
| 58 |
+
hidden_size: 768
|
| 59 |
+
vocab_size: 30522
|
| 60 |
+
type_vocab_size: 2
|
| 61 |
+
pad_token_id: 0
|
| 62 |
+
use_position_embeddings: false
|
| 63 |
+
max_position_embeddings: 4096
|
| 64 |
+
use_timestep_embeddings: true
|
| 65 |
+
layer_norm_eps: 1.0e-12
|
| 66 |
+
dropout: 0.0
|
| 67 |
+
max_timestep_embeddings: 1000
|
| 68 |
+
current_timestep: 0
|
| 69 |
+
d_state: 64
|
| 70 |
+
d_conv: 7
|
| 71 |
+
head_dim: 64
|
| 72 |
+
expand: 2
|
| 73 |
+
chunk_size: 256
|
| 74 |
+
is_prenorm: false
|
| 75 |
+
use_eff_compute: false
|
| 76 |
+
gradient_checkpointing: true
|
| 77 |
+
num_hidden_layers: 23
|
| 78 |
+
guider_hidden_layers: 12
|
| 79 |
+
device: cpu
|
| 80 |
+
pool_all: false
|
| 81 |
+
mark_kernel: hypernet
|
| 82 |
+
mark_ensemble: false
|
| 83 |
+
rank: 2
|
| 84 |
+
degree: 5
|
| 85 |
+
L_timepoints: 256
|
| 86 |
+
n_freqs: 8
|
| 87 |
+
mark_mlp_dim: 256
|
| 88 |
+
hidden_act: swish
|
| 89 |
+
initializer_range: 0.02
|