File size: 5,549 Bytes
bd91d07
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
# Dataset location plus per-entry input-error and masking settings.
dataset_path: /pscratch/sd/b/binxia/supermock_dataset_11.2-14.json
# Seven entries, the same count as the `modalities` list.
# NOTE(review): presumably one value per modality, in the same order - confirm.
input_errors: [0, 0, 0, 0, 0, 0, 0]
mask_token: 0
masked_generation: false
# Seven entries as well: 0.2 for the first four, 0.5 for the last three.
# NOTE(review): presumably index-aligned with `modalities` - confirm.
masking_prob: [0.2, 0.2, 0.2, 0.2, 0.5, 0.5, 0.5]
# Names of the modalities used by this configuration (seven entries).
# The `mag_{band}_*` names contain literal braces; to YAML they are ordinary
# plain strings in block style (they would need quoting inside a flow list).
# NOTE(review): presumably `{band}` is a placeholder filled in by the
# consumer of this file - confirm.
modalities:
- SFH
- SED
- mag_{band}_spherex
- mag_{band}_lsst
- redshift
- halo_mass
- stellar_mass
# Two-element shape recorded for each of the three scalar modalities.
# NOTE(review): presumably [number of samples, values per sample] - confirm.
scalar_shape:
  redshift: [20000, 1]
  halo_mass: [20000, 1]
  stellar_mass: [20000, 1]
# Two-element shape recorded for each of the four vector modalities.
# NOTE(review): presumably [number of samples, vector length] - confirm.
vector_shape:
  SFH: [20000, 117]
  SED: [20000, 921]
  mag_{band}_spherex: [20000, 102]
  mag_{band}_lsst: [20000, 6]
# Model hyperparameters and loss configuration.
model_config:
  attention_probs_dropout_prob: 0.1
  classifier_dropout: 0.0
  contrastive_temperature: 0.05
  hidden_dropout_prob: 0.1
  hidden_size: 384
  intermediate_size: 3072
  # One entry per loss term; each carries a `rounds` count and a two-element
  # `w0T` list.
  # NOTE(review): presumably w0T is [starting weight, final weight] - confirm.
  loss_weights:
    contrastive:
      rounds: 0
      w0T: [0, 0]
    masked:
      rounds: 0
      w0T: [0.8, 3]
    smooth:
      rounds: 0
      w0T: [0, 0.3]
    unmasked:
      rounds: 0
      w0T: [0.2, 0.3]
  # 1149 = 117 + 921 + 102 + 6 (second dimensions under `vector_shape`)
  # + 3 (entries under `scalar_shape`).
  # NOTE(review): presumably one position per input value - confirm.
  max_position_embeddings: 1149
  num_attention_heads: 12
  num_hidden_layers: 8
  pad_token_id: -1
  # The same key also appears at the top level of this file, also `false`.
  transform_numeric: false
  # Loss switches: only `use_mlm_loss` is enabled in this configuration.
  use_contrastive_loss: false
  use_mlm_loss: true
  use_regression_loss: false
  use_sdpa_attention: true
  use_xval_loss: false
  vocab_size: 2048
# Model and tokenizer identifiers, plus the sample-count setting.
model_name_or_path: galaxybert
# NOTE(review): -1 presumably means "no limit / use every sample" - confirm
# against the code that reads this key.
num_total_samples: -1
tokenizer_name_or_path: Salesforce/SFR-Embedding-Mistral
# Trainer arguments. The `!!python/object/apply:transformers...` tags below
# show that this mapping was serialized from Python objects of the
# `transformers` package.
# NOTE(review): those Python-specific tags are rejected by safe YAML loaders
# (e.g. `yaml.safe_load`); reading this file back requires a loader that is
# allowed to construct those enum objects, which should only be used on
# trusted files.
training_args:
  # NOTE(review): looks like runtime state captured from the dumping process
  # rather than a user setting - confirm before reusing as input.
  _n_gpu: 1
  accelerator_config:
    dispatch_batches: null
    even_batches: true
    gradient_accumulation_kwargs: null
    non_blocking: false
    split_batches: false
    use_configured_state: false
    use_seedable_sampler: true
  adafactor: false
  adam_beta1: 0.9
  adam_beta2: 0.999
  adam_epsilon: 1.0e-08
  auto_find_batch_size: false
  average_tokens_across_devices: true
  batch_eval_metrics: false
  bf16: true
  bf16_full_eval: false
  data_seed: null
  dataloader_drop_last: false
  dataloader_num_workers: 16
  dataloader_persistent_workers: false
  dataloader_pin_memory: true
  dataloader_prefetch_factor: 8
  ddp_backend: null
  ddp_broadcast_buffers: null
  ddp_bucket_cap_mb: null
  ddp_find_unused_parameters: null
  ddp_timeout: 1800
  debug: []
  deepspeed: null
  disable_tqdm: false
  # NOTE(review): `do_train` is false although `num_train_epochs` is 120;
  # confirm whether the consumer relies on these flags.
  do_eval: true
  do_predict: false
  do_train: false
  eval_accumulation_steps: 5
  eval_delay: 0
  eval_do_concat_batches: true
  eval_on_start: false
  eval_steps: 20
  # Python-tagged enum value; see the loader note at the top of this mapping.
  eval_strategy: !!python/object/apply:transformers.trainer_utils.IntervalStrategy
  - steps
  eval_use_gather_object: false
  fp16: false
  fp16_backend: auto
  fp16_full_eval: false
  fp16_opt_level: O1
  fsdp: []
  fsdp_config:
    min_num_params: 0
    xla: false
    xla_fsdp_grad_ckpt: false
    xla_fsdp_v2: false
  fsdp_min_num_params: 0
  fsdp_transformer_layer_cls_to_wrap: null
  full_determinism: false
  gradient_accumulation_steps: 5
  gradient_checkpointing: false
  gradient_checkpointing_kwargs: null
  greater_is_better: null
  group_by_length: false
  half_precision_backend: auto
  hub_always_push: false
  hub_model_id: null
  hub_private_repo: null
  hub_revision: null
  # Python-tagged enum value; see the loader note at the top of this mapping.
  hub_strategy: !!python/object/apply:transformers.trainer_utils.HubStrategy
  - every_save
  hub_token: null
  ignore_data_skip: false
  include_for_metrics: []
  include_inputs_for_metrics: false
  # Quoted so the value stays the string 'no' instead of being read as a
  # boolean by YAML 1.1 parsers.
  include_num_input_tokens_seen: 'no'
  include_tokens_per_second: false
  jit_mode_eval: false
  label_names: null
  label_smoothing_factor: 0.0
  learning_rate: 0.0001
  length_column_name: length
  liger_kernel_config: null
  load_best_model_at_end: false
  # NOTE(review): a specific rank (3) is recorded here; presumably this dump
  # was written by one worker process - confirm before reusing as input.
  local_rank: 3
  log_level: passive
  log_level_replica: warning
  log_on_each_node: true
  logging_dir: sm_foundation_lg_gmm_nomasklab
  logging_first_step: true
  logging_nan_inf_filter: true
  logging_steps: 1
  # Python-tagged enum value; see the loader note at the top of this mapping.
  logging_strategy: !!python/object/apply:transformers.trainer_utils.IntervalStrategy
  - steps
  lr_scheduler_kwargs: {}
  # Python-tagged enum value; see the loader note at the top of this mapping.
  lr_scheduler_type: !!python/object/apply:transformers.trainer_utils.SchedulerType
  - cosine
  max_grad_norm: 1.0
  max_steps: -1
  metric_for_best_model: null
  mp_parameters: ''
  neftune_noise_alpha: null
  no_cuda: false
  num_train_epochs: 120
  # Python-tagged enum value; see the loader note at the top of this mapping.
  optim: !!python/object/apply:transformers.training_args.OptimizerNames
  - adamw_torch
  optim_args: null
  optim_target_modules: null
  output_dir: supermock_light_nte120_nts-1
  overwrite_output_dir: true
  parallelism_config: null
  past_index: -1
  per_device_eval_batch_size: 40
  per_device_train_batch_size: 40
  per_gpu_eval_batch_size: null
  per_gpu_train_batch_size: null
  prediction_loss_only: false
  project: huggingface
  push_to_hub: false
  push_to_hub_model_id: null
  push_to_hub_organization: null
  push_to_hub_token: null
  ray_scope: last
  remove_unused_columns: false
  report_to:
  - wandb
  restore_callback_states_from_checkpoint: false
  resume_from_checkpoint: null
  run_name: NO_SHARD_b50
  save_on_each_node: false
  save_only_model: false
  save_safetensors: true
  save_steps: 30
  # Python-tagged enum value; see the loader note at the top of this mapping.
  save_strategy: !!python/object/apply:transformers.trainer_utils.SaveStrategy
  - steps
  save_total_limit: 360
  seed: 42
  skip_memory_metrics: true
  tf32: null
  torch_compile: false
  torch_compile_backend: null
  torch_compile_mode: null
  torch_empty_cache_steps: null
  torchdynamo: null
  tpu_metrics_debug: false
  tpu_num_cores: null
  trackio_space_id: trackio
  use_cpu: false
  use_legacy_prediction_loop: false
  use_liger_kernel: false
  use_mps_device: false
  warmup_ratio: 0.0
  warmup_steps: 0
  weight_decay: 0.1
# Top-level copy of the flag that also appears as
# `model_config.transform_numeric`; both are `false` in this file.
# NOTE(review): confirm which of the two the consumer actually reads.
transform_numeric: false
wandb_project: supermock-foundation-perl
# Explicit empty string (not null); `training_args.run_name` holds a
# non-empty name (`NO_SHARD_b50`).
wandb_run_name: ''