# File size: 5,549 Bytes
#
# bd91d07

# Data source and masking settings.
# NOTE(review): absolute, user-specific path under /pscratch -- confirm it is
# reachable from the machine that consumes this config.
dataset_path: /pscratch/sd/b/binxia/supermock_dataset_11.2-14.json
# Seven entries, the same count as `modalities` below; presumably one value
# per modality in the same order -- TODO confirm against the loader.
input_errors: [0, 0, 0, 0, 0, 0, 0]
mask_token: 0
masked_generation: false
# Seven entries as well: 0.2 for the first four positions, 0.5 for the last
# three. Presumably aligned with `modalities` by position -- TODO confirm.
masking_prob: [0.2, 0.2, 0.2, 0.2, 0.5, 0.5, 0.5]
# Kept in block style on purpose: two names contain `{` and `}`, which are
# not allowed in plain scalars inside a flow sequence.
modalities:
- SFH
- SED
- mag_{band}_spherex
- mag_{band}_lsst
- redshift
- halo_mass
- stellar_mass
# Two-element shape per modality, written as [first_dim, second_dim].
# Every entry shares first_dim 20000 -- presumably the sample count; confirm.
# Modalities with second_dim 1.
scalar_shape:
  redshift: [20000, 1]
  halo_mass: [20000, 1]
  stellar_mass: [20000, 1]
# Modalities with second_dim greater than 1.
vector_shape:
  SFH: [20000, 117]
  SED: [20000, 921]
  mag_{band}_spherex: [20000, 102]
  mag_{band}_lsst: [20000, 6]
# Model hyperparameters.
model_config:
  attention_probs_dropout_prob: 0.1
  classifier_dropout: 0.0
  contrastive_temperature: 0.05
  hidden_dropout_prob: 0.1
  # 384 / 12 attention heads = 32 per head.
  hidden_size: 384
  intermediate_size: 3072
  # One entry per loss term; each has `rounds` and a two-element `w0T`.
  # NOTE(review): `w0T` presumably holds a start/end weight pair -- confirm
  # against the code that reads it.
  loss_weights:
    contrastive:
      rounds: 0
      w0T: [0, 0]
    masked:
      rounds: 0
      w0T: [0.8, 3]
    smooth:
      rounds: 0
      w0T: [0, 0.3]
    unmasked:
      rounds: 0
      w0T: [0.2, 0.3]
  # 1149 equals the sum of the second dimensions listed in `vector_shape`
  # and `scalar_shape` in this file (117 + 921 + 102 + 6 + 1 + 1 + 1).
  max_position_embeddings: 1149
  num_attention_heads: 12
  num_hidden_layers: 8
  pad_token_id: -1
  # Same key and value also appear at the top level of this file.
  transform_numeric: false
  # Of the four loss switches below, only `use_mlm_loss` is enabled.
  use_contrastive_loss: false
  use_mlm_loss: true
  use_regression_loss: false
  use_sdpa_attention: true
  use_xval_loss: false
  vocab_size: 2048
# Model and tokenizer identifiers, plus the sample cap.
model_name_or_path: galaxybert
# NOTE(review): -1 is presumably a "use all samples" sentinel -- confirm.
num_total_samples: -1
# NOTE(review): a text-embedding tokenizer id is set here while
# model_config.vocab_size is 2048 -- confirm this value is actually used.
tokenizer_name_or_path: Salesforce/SFR-Embedding-Mistral
# Serialized trainer arguments.
# Several values below carry `!!python/object/apply:transformers...` tags.
# Safe YAML loaders reject these tags; a loader that constructs Python
# objects is required, so only load this file from a trusted source.
training_args:
  _n_gpu: 1
  accelerator_config:
    dispatch_batches: null
    even_batches: true
    gradient_accumulation_kwargs: null
    non_blocking: false
    split_batches: false
    use_configured_state: false
    use_seedable_sampler: true
  adafactor: false
  adam_beta1: 0.9
  adam_beta2: 0.999
  adam_epsilon: 1.0e-08
  auto_find_batch_size: false
  average_tokens_across_devices: true
  batch_eval_metrics: false
  # bf16 is on; fp16 (further down) is off.
  bf16: true
  bf16_full_eval: false
  data_seed: null
  dataloader_drop_last: false
  dataloader_num_workers: 16
  dataloader_persistent_workers: false
  dataloader_pin_memory: true
  dataloader_prefetch_factor: 8
  ddp_backend: null
  ddp_broadcast_buffers: null
  ddp_bucket_cap_mb: null
  ddp_find_unused_parameters: null
  ddp_timeout: 1800
  debug: []
  deepspeed: null
  disable_tqdm: false
  do_eval: true
  do_predict: false
  # NOTE(review): false even though num_train_epochs is 120 -- confirm the
  # training script does not gate training on this flag.
  do_train: false
  eval_accumulation_steps: 5
  eval_delay: 0
  eval_do_concat_batches: true
  eval_on_start: false
  # Evaluation interval (20) differs from the save interval (save_steps: 30).
  eval_steps: 20
  eval_strategy: !!python/object/apply:transformers.trainer_utils.IntervalStrategy
  - steps
  eval_use_gather_object: false
  fp16: false
  fp16_backend: auto
  fp16_full_eval: false
  fp16_opt_level: O1
  fsdp: []
  fsdp_config:
    min_num_params: 0
    xla: false
    xla_fsdp_grad_ckpt: false
    xla_fsdp_v2: false
  fsdp_min_num_params: 0
  fsdp_transformer_layer_cls_to_wrap: null
  full_determinism: false
  # With per_device_train_batch_size 40 this gives 40 x 5 = 200 samples per
  # device per optimizer step.
  gradient_accumulation_steps: 5
  gradient_checkpointing: false
  gradient_checkpointing_kwargs: null
  greater_is_better: null
  group_by_length: false
  half_precision_backend: auto
  hub_always_push: false
  hub_model_id: null
  hub_private_repo: null
  hub_revision: null
  hub_strategy: !!python/object/apply:transformers.trainer_utils.HubStrategy
  - every_save
  hub_token: null
  ignore_data_skip: false
  include_for_metrics: []
  include_inputs_for_metrics: false
  # Quoted so the value stays the string 'no' rather than a boolean.
  include_num_input_tokens_seen: 'no'
  include_tokens_per_second: false
  jit_mode_eval: false
  label_names: null
  label_smoothing_factor: 0.0
  learning_rate: 0.0001
  length_column_name: length
  liger_kernel_config: null
  load_best_model_at_end: false
  # NOTE(review): a non-zero rank is recorded here; presumably this dump was
  # written by a non-main process -- confirm before reusing the file as input.
  local_rank: 3
  log_level: passive
  log_level_replica: warning
  log_on_each_node: true
  logging_dir: sm_foundation_lg_gmm_nomasklab
  logging_first_step: true
  logging_nan_inf_filter: true
  logging_steps: 1
  logging_strategy: !!python/object/apply:transformers.trainer_utils.IntervalStrategy
  - steps
  lr_scheduler_kwargs: {}
  # Cosine schedule; warmup_ratio and warmup_steps below are both zero.
  lr_scheduler_type: !!python/object/apply:transformers.trainer_utils.SchedulerType
  - cosine
  max_grad_norm: 1.0
  max_steps: -1
  metric_for_best_model: null
  mp_parameters: ''
  neftune_noise_alpha: null
  no_cuda: false
  num_train_epochs: 120
  optim: !!python/object/apply:transformers.training_args.OptimizerNames
  - adamw_torch
  optim_args: null
  optim_target_modules: null
  # overwrite_output_dir is true, so existing contents of this directory may
  # be replaced.
  output_dir: supermock_light_nte120_nts-1
  overwrite_output_dir: true
  parallelism_config: null
  past_index: -1
  per_device_eval_batch_size: 40
  per_device_train_batch_size: 40
  per_gpu_eval_batch_size: null
  per_gpu_train_batch_size: null
  prediction_loss_only: false
  project: huggingface
  push_to_hub: false
  push_to_hub_model_id: null
  push_to_hub_organization: null
  push_to_hub_token: null
  ray_scope: last
  remove_unused_columns: false
  report_to:
  - wandb
  restore_callback_states_from_checkpoint: false
  resume_from_checkpoint: null
  # NOTE(review): the name contains "b50" while both per-device batch sizes
  # are 40 -- confirm the name is not stale.
  run_name: NO_SHARD_b50
  save_on_each_node: false
  save_only_model: false
  save_safetensors: true
  save_steps: 30
  save_strategy: !!python/object/apply:transformers.trainer_utils.SaveStrategy
  - steps
  save_total_limit: 360
  seed: 42
  skip_memory_metrics: true
  tf32: null
  torch_compile: false
  torch_compile_backend: null
  torch_compile_mode: null
  torch_empty_cache_steps: null
  torchdynamo: null
  tpu_metrics_debug: false
  tpu_num_cores: null
  trackio_space_id: trackio
  use_cpu: false
  use_legacy_prediction_loop: false
  use_liger_kernel: false
  use_mps_device: false
  warmup_ratio: 0.0
  warmup_steps: 0
  weight_decay: 0.1
# Top-level copy of the flag that also appears as
# model_config.transform_numeric; both are false in this file.
transform_numeric: false
wandb_project: supermock-foundation-perl
# Empty string here, while training_args.run_name is set to NO_SHARD_b50.
# NOTE(review): confirm which of the two the logger actually uses.
wandb_run_name: ''