Delete
Browse files
lightning_logs/version_2/events.out.tfevents.1776329596.b85934d9bf78.46840.0
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:b0c7420a7b2e8a3090ded89e5d94f25a009c120e2d0f8a28877911c34cf0fc00
|
| 3 |
-
size 7206
|
|
|
|
|
|
|
|
|
|
|
|
lightning_logs/version_2/hparams.yaml
DELETED
|
@@ -1,89 +0,0 @@
|
|
| 1 |
-
batch_size: 112
|
| 2 |
-
epochs: 3
|
| 3 |
-
stage: 3
|
| 4 |
-
unfrozen_ratio: 0.3
|
| 5 |
-
ckpt_weights_only: true
|
| 6 |
-
checkpoint_dir: ./checkpoints/hydra_mark
|
| 7 |
-
train_data_dir: ./data/train_shards3
|
| 8 |
-
val_data_dir: ./data/val_shards
|
| 9 |
-
weights_path: ./models/hydra_hypernet_mark.pt
|
| 10 |
-
shuffle: true
|
| 11 |
-
use_early_stopping: false
|
| 12 |
-
max_patience_counter: 5
|
| 13 |
-
min_delta: 0.01
|
| 14 |
-
use_gradient_clipping: true
|
| 15 |
-
gradient_clipping_norm: 1.0
|
| 16 |
-
pad_length: 4096
|
| 17 |
-
learning_rate_mark: 0.0006
|
| 18 |
-
learning_rate_hydra: 3.0e-05
|
| 19 |
-
learning_rate_cls: 0.0001
|
| 20 |
-
no_cache: false
|
| 21 |
-
num_workers: 8
|
| 22 |
-
matmul_precision: high
|
| 23 |
-
multi_shot: false
|
| 24 |
-
intervals: 3
|
| 25 |
-
is_prenorm: false
|
| 26 |
-
accumulate_grad_batches: 2
|
| 27 |
-
cart: true
|
| 28 |
-
cart_p: 0.45
|
| 29 |
-
cart_scale: 1.0
|
| 30 |
-
distillation: false
|
| 31 |
-
lr_scheduler:
|
| 32 |
-
type: cosine
|
| 33 |
-
warmup_steps: 720
|
| 34 |
-
total_steps: 14400
|
| 35 |
-
min_lr_ratio: 0.1
|
| 36 |
-
polynomial:
|
| 37 |
-
end_lr_ratio: 0.0
|
| 38 |
-
power: 1.0
|
| 39 |
-
plateau:
|
| 40 |
-
factor: 0.5
|
| 41 |
-
patience: 3
|
| 42 |
-
min_lr: 1.0e-06
|
| 43 |
-
trainer:
|
| 44 |
-
accelerator: gpu
|
| 45 |
-
devices: -1
|
| 46 |
-
check_val_every_n_epoch: null
|
| 47 |
-
num_sanity_val_steps: 0
|
| 48 |
-
accumulate_grad_batches: 1
|
| 49 |
-
precision: bf16-true
|
| 50 |
-
enable_checkpointing: true
|
| 51 |
-
default_root_dir: ./checkpoints/hydra_mark
|
| 52 |
-
wandb:
|
| 53 |
-
project: hydra-training_hypernet
|
| 54 |
-
model_name: HydraForMaskedLM
|
| 55 |
-
watch_log: all
|
| 56 |
-
log_freq: 20
|
| 57 |
-
hydra_config:
|
| 58 |
-
hidden_size: 768
|
| 59 |
-
vocab_size: 30522
|
| 60 |
-
type_vocab_size: 2
|
| 61 |
-
pad_token_id: 0
|
| 62 |
-
use_position_embeddings: false
|
| 63 |
-
max_position_embeddings: 4096
|
| 64 |
-
use_timestep_embeddings: true
|
| 65 |
-
layer_norm_eps: 1.0e-12
|
| 66 |
-
dropout: 0.0
|
| 67 |
-
max_timestep_embeddings: 1000
|
| 68 |
-
current_timestep: 0
|
| 69 |
-
d_state: 64
|
| 70 |
-
d_conv: 7
|
| 71 |
-
head_dim: 64
|
| 72 |
-
expand: 2
|
| 73 |
-
chunk_size: 256
|
| 74 |
-
is_prenorm: false
|
| 75 |
-
use_eff_compute: false
|
| 76 |
-
gradient_checkpointing: true
|
| 77 |
-
num_hidden_layers: 23
|
| 78 |
-
guider_hidden_layers: 12
|
| 79 |
-
device: cpu
|
| 80 |
-
pool_all: false
|
| 81 |
-
mark_kernel: hypernet
|
| 82 |
-
mark_ensemble: false
|
| 83 |
-
rank: 2
|
| 84 |
-
degree: 5
|
| 85 |
-
L_timepoints: 256
|
| 86 |
-
n_freqs: 8
|
| 87 |
-
mark_mlp_dim: 256
|
| 88 |
-
hidden_act: swish
|
| 89 |
-
initializer_range: 0.02
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|