dylanzhou2 committed on
Commit
ef5c98d
·
1 Parent(s): 00056e9

Add model weights and config

Browse files
Files changed (3) hide show
  1. .gitattributes +1 -0
  2. config.yaml +182 -0
  3. model.pt +3 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ model.pt filter=lfs diff=lfs merge=lfs -text
config.yaml ADDED
@@ -0,0 +1,182 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ run_name: olmo-multi-host-4x8-h100-ddp-198984688
2
+ seed: 6198
3
+ epoch: null
4
+ dry_run: false
5
+ model:
6
+ d_model: 2048
7
+ n_heads: 16
8
+ n_kv_heads: null
9
+ clip_qkv: null
10
+ n_layers: 16
11
+ mlp_ratio: 8
12
+ mlp_hidden_size: null
13
+ activation_type: swiglu
14
+ block_type: sequential
15
+ block_group_size: 1
16
+ alibi: false
17
+ alibi_bias_max: 8.0
18
+ rope: true
19
+ rope_full_precision: true
20
+ rope_theta: 500000
21
+ flash_attention: true
22
+ attention_dropout: 0.0
23
+ multi_query_attention: null
24
+ attention_layer_norm: true
25
+ residual_dropout: 0.0
26
+ embedding_dropout: 0.0
27
+ embedding_layer_norm: false
28
+ layer_norm_type: rms
29
+ layer_norm_with_affine: true
30
+ layer_norm_eps: 1.0e-06
31
+ attention_layer_norm_with_affine: true
32
+ max_sequence_length: 4096
33
+ include_bias: false
34
+ bias_for_layer_norm: false
35
+ scale_logits: false
36
+ vocab_size: 50280
37
+ embedding_size: 50304
38
+ weight_tying: false
39
+ eos_token_id: 50279
40
+ pad_token_id: 50256
41
+ init_device: cuda
42
+ init_fn: normal
43
+ init_std: 0.02
44
+ init_cutoff_factor: 3.0
45
+ precision: amp_bf16
46
+ scale_emb_init: false
47
+ emb_init_std: null
48
+ norm_after: true
49
+ optimizer:
50
+ name: adamw
51
+ learning_rate: 0.0004
52
+ weight_decay: 0.1
53
+ betas:
54
+ - 0.9
55
+ - 0.95
56
+ eps: 1.0e-08
57
+ no_decay_norm_and_bias: null
58
+ selective_updates: false
59
+ decay_norm_and_bias: true
60
+ decay_embeddings: false
61
+ metrics_log_interval: 100
62
+ record_update_metrics: false
63
+ scheduler:
64
+ name: cosine_with_warmup
65
+ units: tokens
66
+ t_warmup: 2000
67
+ t_max: 5000000000000.0
68
+ alpha_f: 0.1
69
+ grad_clip_warmup_steps: null
70
+ grad_clip_warmup_factor: null
71
+ warmup_min_lr: 0.0
72
+ data:
73
+ paths:
74
+ - /dev/disk-scratch/fwe_manifest.txt
75
+ memmap_dtype: uint16
76
+ datasets: null
77
+ label_mask_paths: null
78
+ pad_direction: right
79
+ generate_attention_mask: false
80
+ generate_doc_lengths: false
81
+ num_workers: 2
82
+ drop_last: true
83
+ pin_memory: true
84
+ prefetch_factor: 2
85
+ persistent_workers: true
86
+ timeout: 0
87
+ seed: null
88
+ instance_filter:
89
+ repetition_max_period: 13
90
+ repetition_min_period: 1
91
+ repetition_max_count: 32
92
+ custom_dataset: null
93
+ restore_dataloader: true
94
+ fast_forward_batches: null
95
+ evaluators:
96
+ - label: openbook_qa
97
+ type: downstream
98
+ data:
99
+ paths: null
100
+ memmap_dtype: uint16
101
+ datasets: null
102
+ label_mask_paths: null
103
+ pad_direction: right
104
+ generate_attention_mask: false
105
+ generate_doc_lengths: false
106
+ num_workers: 0
107
+ drop_last: false
108
+ pin_memory: false
109
+ prefetch_factor: null
110
+ persistent_workers: false
111
+ timeout: 0
112
+ seed: null
113
+ instance_filter: null
114
+ custom_dataset: null
115
+ device_eval_batch_size: null
116
+ subset_num_batches: null
117
+ eval_interval: 100
118
+ tokenizer:
119
+ identifier: tokenizers/allenai_eleuther-ai-gpt-neox-20b-pii-special.json
120
+ truncate_direction: right
121
+ save_folder: /gcs/dylanzhou-73df477742b7/olmo/checkpoints/olmo-multi-host-4x8-h100-ddp-198984688
122
+ remote_save_folder: null
123
+ canceled_check_interval: 50
124
+ save_interval: 100
125
+ save_interval_unsharded: 200
126
+ save_interval_ephemeral: 100
127
+ save_num_checkpoints_to_keep: 100
128
+ save_num_unsharded_checkpoints_to_keep: 5
129
+ save_overwrite: true
130
+ force_save_unsharded: false
131
+ no_pre_train_checkpoint: true
132
+ load_path: null
133
+ load_path_sharded_checkpointer: null
134
+ try_load_latest_save: false
135
+ reset_optimizer_state: false
136
+ reset_trainer_state: false
137
+ sharded_checkpointer: torch_new
138
+ new_style_checkpoints: null
139
+ max_duration: 1e11T
140
+ global_train_batch_size: 512
141
+ device_train_batch_size: 16
142
+ device_train_microbatch_size: 2
143
+ device_eval_batch_size: 2
144
+ eval_subset_num_batches: -1
145
+ eval_on_load: false
146
+ device_train_grad_accum: 8
147
+ max_grad_norm: 1.0
148
+ max_grad_norm_ratio: null
149
+ precision: amp_bf16
150
+ wandb: null
151
+ speed_monitor:
152
+ window_size: 1
153
+ gpu_flops_available: null
154
+ console_log_interval: 500
155
+ gen1_gc_interval: 10
156
+ compile: null
157
+ distributed_strategy: ddp
158
+ fsdp:
159
+ use_orig_params: true
160
+ sharding_strategy: FULL_SHARD
161
+ wrapping_strategy: null
162
+ precision: pure
163
+ hybrid_sharding_num_model_replicas: null
164
+ ddp:
165
+ grad_sync_mode: batch
166
+ find_unused_params: false
167
+ single:
168
+ device: auto
169
+ softmax_auxiliary_loss: true
170
+ auxiliary_loss_multiplier: 1.0e-05
171
+ time_limit: null
172
+ extra_steps_after_cancel: 10
173
+ early_stopping_factor: null
174
+ save_data_indices: true
175
+ python_profiling: false
176
+ torch_profiling: false
177
+ stop_at: 47694
178
+ stop_after: null
179
+ activation_checkpointing: whole_layer
180
+ fused_loss: false
181
+ hf_datasets_cache_dir: null
182
+ module_outputs_save_steps: null
model.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:84c19ebe644de504aeda9cd7eaa90381153d279811a3ad182bd865cb69e951bd
3
+ size 5119726359