Upload updated mosaic-light model

Browse files

Files changed (4) hide show

config.json +48 -0
foundation_bert.py +396 -0
model.safetensors +3 -0
train_config.yaml +247 -0

config.json ADDED Viewed

	@@ -0,0 +1,48 @@

+{
+  "_auto_class": "FoundationBert",
+  "auto_map": {
+    "AutoModel": "foundation_bert.FoundationBert"
+  },
+  "architectures": [
+    "FoundationBert"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "attention_probs_dropout_prob": 0.1,
+  "bos_token_id": 50281,
+  "classifier_activation": "gelu",
+  "classifier_bias": false,
+  "classifier_dropout": 0.0,
+  "classifier_pooling": "cls",
+  "cls_token_id": 50281,
+  "decoder_bias": true,
+  "deterministic_flash_attn": false,
+  "dtype": "float32",
+  "embedding_dropout": 0.0,
+  "eos_token_id": 50282,
+  "global_attn_every_n_layers": 3,
+  "global_rope_theta": 160000.0,
+  "hidden_activation": "gelu",
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 384,
+  "initializer_cutoff_factor": 2.0,
+  "initializer_range": 0.02,
+  "intermediate_size": 3072,
+  "local_attention": 128,
+  "local_rope_theta": 10000.0,
+  "max_position_embeddings": 1149,
+  "mlp_bias": false,
+  "mlp_dropout": 0.0,
+  "model_type": "modernbert",
+  "norm_bias": false,
+  "norm_eps": 1e-05,
+  "num_attention_heads": 12,
+  "num_hidden_layers": 8,
+  "pad_token_id": -1,
+  "repad_logits_with_grad": false,
+  "sep_token_id": 50282,
+  "sparse_pred_ignore_index": -100,
+  "sparse_prediction": false,
+  "transformers_version": "4.57.1",
+  "vocab_size": 2048
+}

foundation_bert.py ADDED Viewed

	@@ -0,0 +1,396 @@

+import torch
+import os
+import yaml
+from pathlib import Path
+# from ..utils.masked_data_modeling_loss import MaskedDataLossWithSoftmax
+# from ..utils.contrastive_loss import ContrastiveLoss
+# from ..utils.yaml_util import MyLoader
+from dataclasses import dataclass
+from transformers import ModernBertModel, ModernBertConfig, PretrainedConfig
+from typing import Optional, Union
+# import yaml
+class MyLoader(yaml.SafeLoader):
+    # returns
+    def construct_mapping(self, *args, **kwargs):
+        super().add_constructor(None, construct_undefined)
+        # when loading we want to skip keys that require construction,
+        mapping = super().construct_mapping(*args, **kwargs)
+        return mapping
+import typing
+class Tagged(typing.NamedTuple):
+    tag: str
+    value: object
+def construct_undefined(self, node):
+    if isinstance(node, yaml.nodes.ScalarNode):
+        value = self.construct_scalar(node)
+    elif isinstance(node, yaml.nodes.SequenceNode):
+        value = self.construct_sequence(node)
+    elif isinstance(node, yaml.nodes.MappingNode):
+        value = self.construct_mapping(node)
+    else:
+        assert False, f"unexpected node: {node!r}"
+    return Tagged(node.tag, value)
+@dataclass
+class FoundationOutput:
+    loss: torch.Tensor = None
+    logits: torch.Tensor = None
+    num_output: torch.Tensor = None
+    est_err_output: torch.Tensor = None
+    hidden_states: torch.Tensor = None
+    masked_loss: torch.Tensor = None
+    num_loss: torch.Tensor = None
+    est_err_loss: torch.Tensor = None
+@dataclass
+class FoundationBertConfig:
+    vocab_size: int
+    hidden_size: int
+    num_hidden_layers: int
+    num_attention_heads: int
+    intermediate_size: int
+    hidden_dropout_prob: float
+    attention_probs_dropout_prob: float
+    pad_token_id: int
+    classifier_dropout: float
+    max_position_embeddings: int
+    contrastive_temperature: float
+    loss_weights: dict
+    use_xval_loss: bool = True
+    use_mlm_loss: bool = True
+    use_regression_loss: bool = False
+    use_contrastive_loss: bool = False
+    transform_numeric: bool = False
+    use_sdpa_attention: bool = True
+    def to_dict(self):
+        return {k: getattr(self, k) for k in self.__dataclass_fields__.keys()}
+class FoundationBert(ModernBertModel):
+    def __init__(self,
+                 config: FoundationBertConfig = None,
+                 use_mlm_loss: bool = False,
+                    use_regression_loss: bool = True,
+                    use_contrastive_loss: bool = False,
+                    use_xval_loss: bool = False,
+                    transform_numeric: bool = False,
+                 *args,
+                 **kwargs):
+        self.gconfig = config
+        # print(f"⚠️ FoundationBert.__init__: {self.gconfig=}")
+        bert_conf = ModernBertConfig(
+            vocab_size=config.vocab_size,
+            hidden_size=config.hidden_size,
+            num_hidden_layers=config.num_hidden_layers,
+            num_attention_heads=config.num_attention_heads,
+            intermediate_size=config.intermediate_size,
+            hidden_dropout_prob=config.hidden_dropout_prob,
+            attention_probs_dropout_prob=config.attention_probs_dropout_prob,
+            pad_token_id=config.pad_token_id,
+            max_position_embeddings=config.max_position_embeddings,
+            _attn_implementation='sdpa'
+        )
+        self.gconfig.transform_numeric = transform_numeric
+        super().__init__(bert_conf,)
+        try:
+            if not self.gconfig.use_mlm_loss and not self.gconfig.use_regression_loss and not self.gconfig.use_contrastive_loss:
+                raise ValueError("At least one loss must be enabled")
+            self.loss_mod = float(self.gconfig.use_mlm_loss) + float(self.gconfig.use_regression_loss) + float(self.gconfig.use_contrastive_loss) + float(self.gconfig.use_xval_loss)
+        except:
+            self.gconfig.use_mlm_loss = use_mlm_loss
+            self.gconfig.use_regression_loss = use_regression_loss
+            self.gconfig.use_contrastive_loss = use_contrastive_loss
+            self.gconfig.use_xval_loss = use_xval_loss
+            self.loss_mod = float(self.gconfig.use_mlm_loss) + float(self.gconfig.use_regression_loss) + float(self.gconfig.use_contrastive_loss) + float(self.gconfig.use_xval_loss)
+        self.dataset_path = kwargs.get('dataset_path', None)
+        self.vector_shape = kwargs['vector_shape']
+        self.scalar_shape = kwargs['scalar_shape']
+        self.mask_token = kwargs['mask_token']
+        # self.scalar_keys = [
+        #     'redshift',
+        #     'halo_mass',
+        #     'stellar_mass',
+        # ]
+        # self.vector_keys = [
+        #     'SED',
+        #     'SFH',
+        #     'mag_{band}_spherex',
+        #     'mag_{band}_lsst',
+        # ]
+        # convert modality names to 'scalars' or keep as is if in vector shape
+        self.modalscalars = [m if m in self.vector_shape else 'scalars' for m in self.modalities]
+        # remove duplicates while preserving order
+        self.modalscalars = list(dict.fromkeys(self.modalscalars))
+        print(f"✅ FoundationBert.__init__ is called with {kwargs=}, {self.modalscalars=}, {self.dataset_path=} ✅")
+        self.embedding = torch.nn.ModuleDict() # modality specific embedding layers
+        self.num_head = torch.nn.ModuleDict() # modality specific regression heads
+        # create modality specific layers
+        for modality in self.modalscalars:
+            self.embedding[modality] = torch.nn.Linear(1, config.hidden_size) # input.shape -> ouput.shape: (B, L, 1) -> (B, L, H)
+            self.num_head[modality] = torch.nn.Sequential(
+                torch.nn.Linear(config.hidden_size, config.hidden_size),
+                torch.nn.LayerNorm(config.hidden_size),
+                torch.nn.GELU(),
+                torch.nn.Linear(config.hidden_size, config.hidden_size // 2),
+                torch.nn.GELU(),
+                torch.nn.Linear(config.hidden_size // 2, 1)
+                )
+        # self.position_embeddings = torch.nn.Embedding(config.max_position_embeddings, config.hidden_size)
+        self.embed_dropout = torch.nn.Dropout(config.hidden_dropout_prob)
+        # self.lm_head = torch.nn.Linear(config.hidden_size, config.vocab_size, bias=False) # isn't used currently
+        # self.xval_loss = torch.nn.MSELoss(reduction='none') # isn't used currently
+        # self.mlm_loss = MaskedDataLossWithSoftmax(ignore=-100, reduction='none') # isn't used currently
+        self.distributed_loss = False
+    @property
+    def modalities(self):
+        return self.vector_shape | self.scalar_shape
+    @classmethod
+    def from_pretrained(self,
+        pretrained_model_name_or_path: Optional[Union[str, os.PathLike]],
+        *model_args,
+        config: Optional[Union[PretrainedConfig, str, os.PathLike]] = None,
+        cache_dir: Optional[Union[str, os.PathLike]] = None,
+        ignore_mismatched_sizes: bool = False,
+        force_download: bool = False,
+        local_files_only: bool = False,
+        token: Optional[Union[str, bool]] = None,
+        revision: str = "main",
+        use_safetensors: bool = None,
+        **kwargs,
+    ):
+        """
+            Modification to correctly handle loading extraneous parameters for GBert
+        """
+        if 'checkpoint' in pretrained_model_name_or_path:
+            model_config = Path(pretrained_model_name_or_path).parent / 'train_config.yaml'
+        elif 'train_config.yaml' in os.listdir(pretrained_model_name_or_path):
+            model_config = Path(pretrained_model_name_or_path) / 'train_config.yaml'
+        else:
+            raise ValueError(f"Could not find train_config.yaml in {pretrained_model_name_or_path}")
+        with open(model_config, 'r') as f:
+            config = yaml.load(f, Loader=MyLoader)
+        kwargs['modalities'] = config['modalities']
+        kwargs['dataset_path'] = config['dataset_path']
+        kwargs['mask_token'] = config['mask_token']
+        if 'vector_shape' not in kwargs and 'vector_shape' in config:
+            kwargs['vector_shape'] = config['vector_shape']
+        if 'scalar_shape' not in kwargs and 'scalar_shape' in config:
+            kwargs['scalar_shape'] = config['scalar_shape']
+        print(f"✅ Foundationbert.from_pretrained is called with {model_config=} and {kwargs=} ✅")
+        return super().from_pretrained(
+            pretrained_model_name_or_path,
+            **config['model_config'],
+            **kwargs
+        )
+    def pool_output(self,
+        embeddings: torch.Tensor,
+        attention_mask: torch.Tensor,
+        use_last: bool = False
+    ) -> torch.Tensor:
+        """Average pool the hidden states using the attention mask.
+        Parameters
+        ----------
+        embeddings : torch.Tensor
+            The hidden states to pool (B, SeqLen, HiddenDim).
+        attention_mask : torch.Tensor
+            The attention mask for the hidden states (B, SeqLen).
+        Returns
+        -------
+        torch.Tensor
+            The pooled embeddings (B, HiddenDim).
+        """
+        # Get the sequence lengths
+        sl_mod = 1 if use_last else 2
+        seq_lengths = attention_mask.sum(axis=1)
+        # Set the attention mask to 0 for start and end tokens
+        new_attention = attention_mask.clone()
+        new_attention[:, 0] = attention_mask[:,0] * 0
+        new_attention[:, seq_lengths - sl_mod] =  0 * attention_mask[:, seq_lengths - sl_mod]
+        # Create a mask for the pooling operation (B, SeqLen, HiddenDim)
+        pool_mask = new_attention.unsqueeze(-1).expand(embeddings.shape).to(embeddings.device)
+        # Sum the embeddings over the sequence length (use the mask to avoid
+        # pad, start, and stop tokens)
+        sum_embeds = torch.sum(embeddings * pool_mask, 1)
+        # Avoid division by zero for zero length sequences by clamping
+        # sum_mask = torch.clamp(pool_mask.sum(1), min=1e-9)
+        seq_lengths = torch.clamp(seq_lengths, min=1).unsqueeze(-1)  # Shape (B, 1) to broadcast
+        # Compute mean pooled embeddings for each sequence
+        return sum_embeds / seq_lengths
+    def last_token_pool(
+            self,
+            embeddings: torch.Tensor,
+            attention_mask: torch.Tensor,
+        ) -> torch.Tensor:
+        """Pool the last hidden states using the attention mask.
+        Parameters
+        ----------
+        embeddings : torch.Tensor
+            The last hidden states to pool (B, SeqLen, HiddenDim).
+        attention_mask : torch.Tensor
+            The attention mask for the hidden states (B, SeqLen).
+        Returns
+        -------
+        torch.Tensor
+            The pooled embeddings (B, HiddenDim).
+        """
+        left_padding = attention_mask[:, -1].sum() == attention_mask.shape[0]
+        if left_padding:
+            return embeddings[:, -1]
+        else:
+            sequence_lengths = attention_mask.sum(dim=1) - 1
+            batch_size = embeddings.shape[0]
+            return embeddings[
+                torch.arange(batch_size, device=embeddings.device),
+                sequence_lengths,
+            ]
+    def forward(self, inputs, return_input_label_mapping=False):
+        """
+        Forward pass that computes predictions for each modality.
+        Args:
+            input_label_mapping (dict): A dictionary containing inputs and labels for different modalities.
+        Returns:
+            outputs (dict): A dictionary containing the logits and error logits for each modality.
+        """
+        # Initialize the dictionary for the dynamic input-label mapping
+        input_label_mapping = {}
+        combined = []
+        for src_modality in self.modalscalars:
+            # Add the modality's input and label data to the input_label_mapping
+            input_label_mapping[src_modality] = {
+                'input': inputs[f"input_{src_modality}"],  # Input data
+                'labels': inputs[f"labels_{src_modality}"]  # Corresponding labels
+            }
+            input_data = input_label_mapping[src_modality]['input'] # get input data
+            label = input_label_mapping[src_modality]['labels'] # get label data (for masking)
+            input_data = torch.where(label, self.mask_token, input_data) # apply masking
+            x = self.embedding[src_modality](input_data.unsqueeze(-1)) # shape: (B, L, H)
+            x = torch.nn.functional.silu(x)
+            combined.append(x) # combine all modalities
+        combined = torch.cat(combined, dim=1)  # Concatenate along the sequence length dimension
+        position_ids = torch.arange(combined.size(1)).unsqueeze(0).to(combined.device)  # shape: (1, L)
+        # combined += self.position_embeddings(position_ids) # add position embedding
+        combined = self.embed_dropout(combined)
+        # x = self.encoder(combined, output_hidden_states=True).last_hidden_state # encode the combined input
+        hidden_states = combined
+        for encoder_layer in self.layers:
+            hidden_states = encoder_layer(hidden_states, position_ids = position_ids)[0]
+        x = self.final_norm(hidden_states)
+        start = 0
+        outputs = {}
+        # Iterate over each target modality to compute logits
+        for tgt_modality in self.modalscalars:
+            length = input_label_mapping[tgt_modality]['input'].shape[1] # get sequence length of the modality
+            x_t = x[:, start:start+length, :] # slice the encoded output for each modality
+            outputs[f"{tgt_modality}_logits"] = self.num_head[tgt_modality](x_t) # modality specific regression head
+            start += length # update start index for next modality
+            if getattr(self, 'save_umap_for', None):
+                pooled = x_t.mean(dim=1)  # Mean pooling over the sequence length dimension
+                self.save_pooled_embedding(pooled) # saved for UMAP visualization
+        return (outputs, input_label_mapping) if return_input_label_mapping else outputs
+    def save_pooled_embedding(self, features):
+        """
+        Save the last hidden state to a file.
+        """
+        import h5py
+        fname = Path(self.save_umap_for)
+        fname.parent.mkdir(parents=True, exist_ok=True)
+        features = features.detach().cpu().numpy()
+        if fname.exists():
+            with h5py.File(fname, 'r+') as f:
+                old_size = f['features'].shape[0] # get current size
+                new_size = old_size + features.shape[0] # calculate new size
+                f['features'].resize((new_size, features.shape[-1])) # resize dataset
+                f['features'][old_size:] = features # append new features
+        else:
+            with h5py.File(fname, 'w') as f:
+                f.create_dataset('features', data=features, maxshape=(None, features.shape[-1]), chunks=True)
+    def get_retrieval_embedding(
+        self,
+        inputs,
+        pooling: str = "mean",
+        normalize: bool = True,
+    ) -> torch.Tensor:
+        """
+        Build a single embedding per sample for kNN-style retrieval.
+        Parameters
+        ----------
+        inputs : dict
+            Batch dict with `input_<modality>` and `labels_<modality>` entries.
+        pooling : str
+            `mean` (default) or `last`.
+        normalize : bool
+            L2-normalize output embeddings for cosine/inner-product search.
+        """
+        combined = []
+        for src_modality in self.modalscalars:
+            input_data = inputs[f"input_{src_modality}"]
+            label = inputs[f"labels_{src_modality}"]
+            input_data = torch.where(label, self.mask_token, input_data)
+            x = self.embedding[src_modality](input_data.unsqueeze(-1))
+            x = torch.nn.functional.silu(x)
+            combined.append(x)
+        combined = torch.cat(combined, dim=1)
+        position_ids = torch.arange(combined.size(1)).unsqueeze(0).to(combined.device)
+        combined = self.embed_dropout(combined)
+        hidden_states = combined
+        for encoder_layer in self.layers:
+            hidden_states = encoder_layer(hidden_states, position_ids=position_ids)[0]
+        hidden_states = self.final_norm(hidden_states)
+        if pooling == "last":
+            embedding = hidden_states[:, -1, :]
+        else:
+            embedding = hidden_states.mean(dim=1)
+        if normalize:
+            embedding = torch.nn.functional.normalize(embedding, p=2, dim=-1)
+        return embedding

model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:80913251a26a74a8704f02417ae8abf0f2691fc94c4ef3ebf3bc62be8154659a
+size 139771732

train_config.yaml ADDED Viewed

	@@ -0,0 +1,247 @@

+dataset_path: /pscratch/sd/b/binxia/supermock_dataset_11.2-14.json
+input_errors:
+- 0
+- 0
+- 0
+- 0
+- 0
+- 0
+- 0
+mask_token: 0
+masked_generation: false
+masking_prob:
+- 0.2
+- 0.2
+- 0.2
+- 0.2
+- 0.5
+- 0.5
+- 0.5
+modalities:
+- SFH
+- SED
+- mag_{band}_spherex
+- mag_{band}_lsst
+- redshift
+- halo_mass
+- stellar_mass
+scalar_shape:
+  redshift:
+  - 20000
+  - 1
+  halo_mass:
+  - 20000
+  - 1
+  stellar_mass:
+  - 20000
+  - 1
+vector_shape:
+  SFH:
+  - 20000
+  - 117
+  SED:
+  - 20000
+  - 921
+  mag_{band}_spherex:
+  - 20000
+  - 102
+  mag_{band}_lsst:
+  - 20000
+  - 6
+model_config:
+  attention_probs_dropout_prob: 0.1
+  classifier_dropout: 0.0
+  contrastive_temperature: 0.05
+  hidden_dropout_prob: 0.1
+  hidden_size: 384
+  intermediate_size: 3072
+  loss_weights:
+    contrastive:
+      rounds: 0
+      w0T:
+      - 0
+      - 0
+    masked:
+      rounds: 0
+      w0T:
+      - 0.8
+      - 3
+    smooth:
+      rounds: 0
+      w0T:
+      - 0
+      - 0.3
+    unmasked:
+      rounds: 0
+      w0T:
+      - 0.2
+      - 0.3
+  max_position_embeddings: 1149
+  num_attention_heads: 12
+  num_hidden_layers: 8
+  pad_token_id: -1
+  transform_numeric: false
+  use_contrastive_loss: false
+  use_mlm_loss: true
+  use_regression_loss: false
+  use_sdpa_attention: true
+  use_xval_loss: false
+  vocab_size: 2048
+model_name_or_path: galaxybert
+num_total_samples: -1
+tokenizer_name_or_path: Salesforce/SFR-Embedding-Mistral
+training_args:
+  _n_gpu: 1
+  accelerator_config:
+    dispatch_batches: null
+    even_batches: true
+    gradient_accumulation_kwargs: null
+    non_blocking: false
+    split_batches: false
+    use_configured_state: false
+    use_seedable_sampler: true
+  adafactor: false
+  adam_beta1: 0.9
+  adam_beta2: 0.999
+  adam_epsilon: 1.0e-08
+  auto_find_batch_size: false
+  average_tokens_across_devices: true
+  batch_eval_metrics: false
+  bf16: true
+  bf16_full_eval: false
+  data_seed: null
+  dataloader_drop_last: false
+  dataloader_num_workers: 16
+  dataloader_persistent_workers: false
+  dataloader_pin_memory: true
+  dataloader_prefetch_factor: 8
+  ddp_backend: null
+  ddp_broadcast_buffers: null
+  ddp_bucket_cap_mb: null
+  ddp_find_unused_parameters: null
+  ddp_timeout: 1800
+  debug: []
+  deepspeed: null
+  disable_tqdm: false
+  do_eval: true
+  do_predict: false
+  do_train: false
+  eval_accumulation_steps: 5
+  eval_delay: 0
+  eval_do_concat_batches: true
+  eval_on_start: false
+  eval_steps: 20
+  eval_strategy: !!python/object/apply:transformers.trainer_utils.IntervalStrategy
+  - steps
+  eval_use_gather_object: false
+  fp16: false
+  fp16_backend: auto
+  fp16_full_eval: false
+  fp16_opt_level: O1
+  fsdp: []
+  fsdp_config:
+    min_num_params: 0
+    xla: false
+    xla_fsdp_grad_ckpt: false
+    xla_fsdp_v2: false
+  fsdp_min_num_params: 0
+  fsdp_transformer_layer_cls_to_wrap: null
+  full_determinism: false
+  gradient_accumulation_steps: 5
+  gradient_checkpointing: false
+  gradient_checkpointing_kwargs: null
+  greater_is_better: null
+  group_by_length: false
+  half_precision_backend: auto
+  hub_always_push: false
+  hub_model_id: null
+  hub_private_repo: null
+  hub_revision: null
+  hub_strategy: !!python/object/apply:transformers.trainer_utils.HubStrategy
+  - every_save
+  hub_token: null
+  ignore_data_skip: false
+  include_for_metrics: []
+  include_inputs_for_metrics: false
+  include_num_input_tokens_seen: 'no'
+  include_tokens_per_second: false
+  jit_mode_eval: false
+  label_names: null
+  label_smoothing_factor: 0.0
+  learning_rate: 0.0001
+  length_column_name: length
+  liger_kernel_config: null
+  load_best_model_at_end: false
+  local_rank: 3
+  log_level: passive
+  log_level_replica: warning
+  log_on_each_node: true
+  logging_dir: sm_foundation_lg_gmm_nomasklab
+  logging_first_step: true
+  logging_nan_inf_filter: true
+  logging_steps: 1
+  logging_strategy: !!python/object/apply:transformers.trainer_utils.IntervalStrategy
+  - steps
+  lr_scheduler_kwargs: {}
+  lr_scheduler_type: !!python/object/apply:transformers.trainer_utils.SchedulerType
+  - cosine
+  max_grad_norm: 1.0
+  max_steps: -1
+  metric_for_best_model: null
+  mp_parameters: ''
+  neftune_noise_alpha: null
+  no_cuda: false
+  num_train_epochs: 120
+  optim: !!python/object/apply:transformers.training_args.OptimizerNames
+  - adamw_torch
+  optim_args: null
+  optim_target_modules: null
+  output_dir: supermock_light_nte120_nts-1
+  overwrite_output_dir: true
+  parallelism_config: null
+  past_index: -1
+  per_device_eval_batch_size: 40
+  per_device_train_batch_size: 40
+  per_gpu_eval_batch_size: null
+  per_gpu_train_batch_size: null
+  prediction_loss_only: false
+  project: huggingface
+  push_to_hub: false
+  push_to_hub_model_id: null
+  push_to_hub_organization: null
+  push_to_hub_token: null
+  ray_scope: last
+  remove_unused_columns: false
+  report_to:
+  - wandb
+  restore_callback_states_from_checkpoint: false
+  resume_from_checkpoint: null
+  run_name: NO_SHARD_b50
+  save_on_each_node: false
+  save_only_model: false
+  save_safetensors: true
+  save_steps: 30
+  save_strategy: !!python/object/apply:transformers.trainer_utils.SaveStrategy
+  - steps
+  save_total_limit: 360
+  seed: 42
+  skip_memory_metrics: true
+  tf32: null
+  torch_compile: false
+  torch_compile_backend: null
+  torch_compile_mode: null
+  torch_empty_cache_steps: null
+  torchdynamo: null
+  tpu_metrics_debug: false
+  tpu_num_cores: null
+  trackio_space_id: trackio
+  use_cpu: false
+  use_legacy_prediction_loop: false
+  use_liger_kernel: false
+  use_mps_device: false
+  warmup_ratio: 0.0
+  warmup_steps: 0
+  weight_decay: 0.1
+transform_numeric: false
+wandb_project: supermock-foundation-perl
+wandb_run_name: ''