ibitec commited on
Commit
df2d8c3
·
verified ·
1 Parent(s): 2969325
lightning_logs/version_2/events.out.tfevents.1776329596.b85934d9bf78.46840.0 DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:b0c7420a7b2e8a3090ded89e5d94f25a009c120e2d0f8a28877911c34cf0fc00
3
- size 7206
 
 
 
 
lightning_logs/version_2/hparams.yaml DELETED
@@ -1,89 +0,0 @@
1
- batch_size: 112
2
- epochs: 3
3
- stage: 3
4
- unfrozen_ratio: 0.3
5
- ckpt_weights_only: true
6
- checkpoint_dir: ./checkpoints/hydra_mark
7
- train_data_dir: ./data/train_shards3
8
- val_data_dir: ./data/val_shards
9
- weights_path: ./models/hydra_hypernet_mark.pt
10
- shuffle: true
11
- use_early_stopping: false
12
- max_patience_counter: 5
13
- min_delta: 0.01
14
- use_gradient_clipping: true
15
- gradient_clipping_norm: 1.0
16
- pad_length: 4096
17
- learning_rate_mark: 0.0006
18
- learning_rate_hydra: 3.0e-05
19
- learning_rate_cls: 0.0001
20
- no_cache: false
21
- num_workers: 8
22
- matmul_precision: high
23
- multi_shot: false
24
- intervals: 3
25
- is_prenorm: false
26
- accumulate_grad_batches: 2
27
- cart: true
28
- cart_p: 0.45
29
- cart_scale: 1.0
30
- distillation: false
31
- lr_scheduler:
32
- type: cosine
33
- warmup_steps: 720
34
- total_steps: 14400
35
- min_lr_ratio: 0.1
36
- polynomial:
37
- end_lr_ratio: 0.0
38
- power: 1.0
39
- plateau:
40
- factor: 0.5
41
- patience: 3
42
- min_lr: 1.0e-06
43
- trainer:
44
- accelerator: gpu
45
- devices: -1
46
- check_val_every_n_epoch: null
47
- num_sanity_val_steps: 0
48
- accumulate_grad_batches: 1
49
- precision: bf16-true
50
- enable_checkpointing: true
51
- default_root_dir: ./checkpoints/hydra_mark
52
- wandb:
53
- project: hydra-training_hypernet
54
- model_name: HydraForMaskedLM
55
- watch_log: all
56
- log_freq: 20
57
- hydra_config:
58
- hidden_size: 768
59
- vocab_size: 30522
60
- type_vocab_size: 2
61
- pad_token_id: 0
62
- use_position_embeddings: false
63
- max_position_embeddings: 4096
64
- use_timestep_embeddings: true
65
- layer_norm_eps: 1.0e-12
66
- dropout: 0.0
67
- max_timestep_embeddings: 1000
68
- current_timestep: 0
69
- d_state: 64
70
- d_conv: 7
71
- head_dim: 64
72
- expand: 2
73
- chunk_size: 256
74
- is_prenorm: false
75
- use_eff_compute: false
76
- gradient_checkpointing: true
77
- num_hidden_layers: 23
78
- guider_hidden_layers: 12
79
- device: cpu
80
- pool_all: false
81
- mark_kernel: hypernet
82
- mark_ensemble: false
83
- rank: 2
84
- degree: 5
85
- L_timepoints: 256
86
- n_freqs: 8
87
- mark_mlp_dim: 256
88
- hidden_act: swish
89
- initializer_range: 0.02