Xsmos commited on
Commit
719e5d5
·
verified ·
1 Parent(s): b281600

Initial upload of MOSAIC FoundationBert model, v1.0. Final successful local test.

Browse files
__pycache__/foundation_bert.cpython-312.pyc ADDED
Binary file (15.7 kB). View file
 
config.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_auto_class": "FoundationBert",
3
+ "auto_map": {
4
+ "AutoModel": "foundation_bert.FoundationBert"
5
+ },
6
+
7
+ "architectures": [
8
+ "FoundationBert"
9
+ ],
10
+ "attention_probs_dropout_prob": 0.1,
11
+ "classifier_dropout": null,
12
+ "hidden_act": "gelu",
13
+ "hidden_dropout_prob": 0.1,
14
+ "hidden_size": 768,
15
+ "initializer_range": 0.02,
16
+ "intermediate_size": 3072,
17
+ "layer_norm_eps": 1e-12,
18
+ "max_position_embeddings": 1149,
19
+ "model_type": "bert",
20
+ "num_attention_heads": 12,
21
+ "num_hidden_layers": 18,
22
+ "pad_token_id": -1,
23
+ "position_embedding_type": "absolute",
24
+ "torch_dtype": "float32",
25
+ "transformers_version": "4.46.3",
26
+ "type_vocab_size": 2,
27
+ "use_cache": true,
28
+ "vocab_size": 2048
29
+ }
environment.yml ADDED
@@ -0,0 +1,267 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: fsdp
2
+ channels:
3
+ - conda-forge
4
+ dependencies:
5
+ - _libgcc_mutex=0.1=conda_forge
6
+ - _openmp_mutex=4.5=2_gnu
7
+ - anyio=4.9.0=pyh29332c3_0
8
+ - archspec=0.2.3=pyhd8ed1ab_0
9
+ - argon2-cffi=23.1.0=pyhd8ed1ab_1
10
+ - argon2-cffi-bindings=21.2.0=py312h66e93f0_5
11
+ - arrow=1.3.0=pyhd8ed1ab_1
12
+ - asttokens=3.0.0=pyhd8ed1ab_1
13
+ - attrs=25.3.0=pyh71513ae_0
14
+ - beautifulsoup4=4.13.3=pyha770c72_0
15
+ - bleach=6.2.0=pyh29332c3_4
16
+ - bleach-with-css=6.2.0=h82add2a_4
17
+ - boltons=24.0.0=pyhd8ed1ab_0
18
+ - brotli-python=1.1.0=py312h2ec8cdc_2
19
+ - bzip2=1.0.8=h4bc722e_7
20
+ - c-ares=1.32.3=h4bc722e_0
21
+ - ca-certificates=2025.4.26=hbd8a1cb_0
22
+ - cached-property=1.5.2=hd8ed1ab_1
23
+ - cached_property=1.5.2=pyha770c72_1
24
+ - certifi=2025.4.26=pyhd8ed1ab_0
25
+ - cffi=1.17.0=py312h06ac9bb_1
26
+ - charset-normalizer=3.3.2=pyhd8ed1ab_0
27
+ - colorama=0.4.6=pyhd8ed1ab_0
28
+ - comm=0.2.2=pyhd8ed1ab_1
29
+ - conda-package-handling=2.3.0=pyh7900ff3_0
30
+ - conda-package-streaming=0.10.0=pyhd8ed1ab_0
31
+ - debugpy=1.8.13=py312h2ec8cdc_0
32
+ - decorator=5.2.1=pyhd8ed1ab_0
33
+ - defusedxml=0.7.1=pyhd8ed1ab_0
34
+ - distro=1.9.0=pyhd8ed1ab_0
35
+ - exceptiongroup=1.2.2=pyhd8ed1ab_1
36
+ - fmt=11.0.2=h434a139_0
37
+ - fqdn=1.5.1=pyhd8ed1ab_1
38
+ - frozendict=2.4.4=py312h9a8786e_0
39
+ - gitdb=4.0.12=pyhd8ed1ab_0
40
+ - gitpython=3.1.44=pyhff2d567_0
41
+ - h2=4.1.0=pyhd8ed1ab_0
42
+ - hpack=4.0.0=pyh9f0ad1d_0
43
+ - hyperframe=6.0.1=pyhd8ed1ab_0
44
+ - icu=75.1=he02047a_0
45
+ - idna=3.8=pyhd8ed1ab_0
46
+ - importlib-metadata=8.6.1=pyha770c72_0
47
+ - importlib_resources=6.5.2=pyhd8ed1ab_0
48
+ - ipykernel=6.29.5=pyh3099207_0
49
+ - ipython_pygments_lexers=1.1.1=pyhd8ed1ab_0
50
+ - isoduration=20.11.0=pyhd8ed1ab_1
51
+ - jedi=0.19.2=pyhd8ed1ab_1
52
+ - jq=1.7.1=hd590300_0
53
+ - jsonpatch=1.33=pyhd8ed1ab_0
54
+ - jsonpointer=3.0.0=py312h7900ff3_1
55
+ - jsonschema=4.23.0=pyhd8ed1ab_1
56
+ - jsonschema-specifications=2024.10.1=pyhd8ed1ab_1
57
+ - jsonschema-with-format-nongpl=4.23.0=hd8ed1ab_1
58
+ - jupyter-server-mathjax=0.2.6=pyhbbac1ac_2
59
+ - jupyter_client=8.6.3=pyhd8ed1ab_1
60
+ - jupyter_core=5.7.2=pyh31011fe_1
61
+ - jupyter_events=0.12.0=pyh29332c3_0
62
+ - jupyter_server=2.15.0=pyhd8ed1ab_0
63
+ - jupyter_server_terminals=0.5.3=pyhd8ed1ab_1
64
+ - jupyterlab_pygments=0.3.0=pyhd8ed1ab_2
65
+ - keyutils=1.6.1=h166bdaf_0
66
+ - krb5=1.21.3=h659f571_0
67
+ - ld_impl_linux-64=2.40=hf3520f5_7
68
+ - libarchive=3.7.4=hfca40fe_0
69
+ - libcurl=8.10.1=hbbe4b11_0
70
+ - libedit=3.1.20191231=he28a2e2_2
71
+ - libev=4.33=hd590300_2
72
+ - libexpat=2.6.2=h59595ed_0
73
+ - libffi=3.4.2=h7f98852_5
74
+ - libgcc=14.1.0=h77fa898_1
75
+ - libgcc-ng=14.1.0=h69a702a_1
76
+ - libgomp=14.1.0=h77fa898_1
77
+ - libiconv=1.17=hd590300_2
78
+ - libmamba=1.5.10=hf72d635_1
79
+ - libmambapy=1.5.10=py312hf3f0a4e_1
80
+ - libnghttp2=1.58.0=h47da74e_1
81
+ - libnsl=2.0.1=hd590300_0
82
+ - libsodium=1.0.20=h4ab18f5_0
83
+ - libsolv=0.7.30=h3509ff9_0
84
+ - libsqlite=3.46.1=hadc24fc_0
85
+ - libssh2=1.11.0=h0841786_0
86
+ - libstdcxx=14.1.0=hc0a3c3a_1
87
+ - libstdcxx-ng=14.1.0=h4852527_1
88
+ - libuuid=2.38.1=h0b41bf4_0
89
+ - libxcrypt=4.4.36=hd590300_1
90
+ - libxml2=2.12.7=he7c6b58_4
91
+ - libzlib=1.3.1=h4ab18f5_1
92
+ - lz4-c=1.9.4=hcb278e6_0
93
+ - lzo=2.10=hd590300_1001
94
+ - markupsafe=3.0.2=py312h178313f_1
95
+ - matplotlib-inline=0.1.7=pyhd8ed1ab_1
96
+ - menuinst=2.1.2=py312h7900ff3_1
97
+ - mistune=3.1.3=pyh29332c3_0
98
+ - nbclient=0.10.2=pyhd8ed1ab_0
99
+ - nbconvert-core=7.16.6=pyh29332c3_0
100
+ - nbdime=4.0.2=pyhd8ed1ab_1
101
+ - nbformat=5.10.4=pyhd8ed1ab_1
102
+ - ncurses=6.5=he02047a_1
103
+ - nest-asyncio=1.6.0=pyhd8ed1ab_1
104
+ - oniguruma=6.9.10=hb9d3cd8_0
105
+ - openssl=3.5.0=h7b32b05_1
106
+ - overrides=7.7.0=pyhd8ed1ab_1
107
+ - packaging=24.1=pyhd8ed1ab_0
108
+ - parso=0.8.4=pyhd8ed1ab_1
109
+ - pexpect=4.9.0=pyhd8ed1ab_1
110
+ - pickleshare=0.7.5=pyhd8ed1ab_1004
111
+ - pip=24.2=pyh8b19718_1
112
+ - pkgutil-resolve-name=1.3.10=pyhd8ed1ab_2
113
+ - platformdirs=4.2.2=pyhd8ed1ab_0
114
+ - pluggy=1.5.0=pyhd8ed1ab_0
115
+ - prometheus_client=0.21.1=pyhd8ed1ab_0
116
+ - prompt-toolkit=3.0.50=pyha770c72_0
117
+ - psutil=7.0.0=py312h66e93f0_0
118
+ - ptyprocess=0.7.0=pyhd8ed1ab_1
119
+ - pure_eval=0.2.3=pyhd8ed1ab_1
120
+ - pybind11-abi=4=hd8ed1ab_3
121
+ - pycosat=0.6.6=py312h98912ed_0
122
+ - pycparser=2.22=pyhd8ed1ab_0
123
+ - pygments=2.19.1=pyhd8ed1ab_0
124
+ - pysocks=1.7.1=pyha2e5f31_6
125
+ - python=3.12.5=h2ad013b_0_cpython
126
+ - python-dateutil=2.9.0.post0=pyhff2d567_1
127
+ - python-fastjsonschema=2.21.1=pyhd8ed1ab_0
128
+ - python_abi=3.12=5_cp312
129
+ - pyyaml=6.0.2=py312h178313f_2
130
+ - pyzmq=26.2.1=py312hbf22597_0
131
+ - readline=8.2=h8228510_1
132
+ - referencing=0.36.2=pyh29332c3_0
133
+ - reproc=14.2.4.post0=hd590300_1
134
+ - reproc-cpp=14.2.4.post0=h59595ed_1
135
+ - requests=2.32.3=pyhd8ed1ab_0
136
+ - rfc3339-validator=0.1.4=pyhd8ed1ab_1
137
+ - rfc3986-validator=0.1.1=pyh9f0ad1d_0
138
+ - ruamel.yaml=0.18.6=py312h98912ed_0
139
+ - ruamel.yaml.clib=0.2.8=py312h98912ed_0
140
+ - send2trash=1.8.3=pyh0d859eb_1
141
+ - six=1.17.0=pyhd8ed1ab_0
142
+ - smmap=5.0.2=pyhd8ed1ab_0
143
+ - sniffio=1.3.1=pyhd8ed1ab_1
144
+ - stack_data=0.6.3=pyhd8ed1ab_1
145
+ - terminado=0.18.1=pyh0d859eb_0
146
+ - tinycss2=1.4.0=pyhd8ed1ab_0
147
+ - tk=8.6.13=noxft_h4845f30_101
148
+ - tornado=6.4.2=py312h66e93f0_0
149
+ - tqdm=4.66.5=pyhd8ed1ab_0
150
+ - traitlets=5.14.3=pyhd8ed1ab_1
151
+ - truststore=0.9.2=pyhd8ed1ab_0
152
+ - types-python-dateutil=2.9.0.20241206=pyhd8ed1ab_0
153
+ - typing-extensions=4.12.2=hd8ed1ab_1
154
+ - typing_extensions=4.12.2=pyha770c72_1
155
+ - typing_utils=0.1.0=pyhd8ed1ab_1
156
+ - uri-template=1.3.0=pyhd8ed1ab_1
157
+ - urllib3=2.2.2=pyhd8ed1ab_1
158
+ - wcwidth=0.2.13=pyhd8ed1ab_1
159
+ - webcolors=24.11.1=pyhd8ed1ab_0
160
+ - webencodings=0.5.1=pyhd8ed1ab_3
161
+ - websocket-client=1.8.0=pyhd8ed1ab_1
162
+ - wheel=0.44.0=pyhd8ed1ab_0
163
+ - xz=5.2.6=h166bdaf_0
164
+ - yaml=0.2.5=h7f98852_2
165
+ - yaml-cpp=0.8.0=h59595ed_0
166
+ - zeromq=4.3.5=h3b0a872_7
167
+ - zipp=3.21.0=pyhd8ed1ab_1
168
+ - zstandard=0.23.0=py312hef9b889_1
169
+ - zstd=1.5.6=ha6fb4c9_0
170
+ - pip:
171
+ - accelerate==1.1.1
172
+ - alphashape==1.3.1
173
+ - annotated-types==0.7.0
174
+ - async-lru==2.0.5
175
+ - babel==2.17.0
176
+ - brokenaxes==0.6.2
177
+ - click==8.1.8
178
+ - click-log==0.4.0
179
+ - contourpy==1.3.1
180
+ - cycler==0.12.1
181
+ - descartes==1.1.0
182
+ - docker-pycreds==0.4.0
183
+ - einops==0.8.0
184
+ - executing==2.2.0
185
+ - filelock==3.17.0
186
+ - fonttools==4.56.0
187
+ - fsspec==2025.2.0
188
+ - h11==0.14.0
189
+ - hjson==3.1.0
190
+ - httpcore==1.0.7
191
+ - httpx==0.28.1
192
+ - huggingface-hub==0.29.1
193
+ - ijson==3.4.0
194
+ - ipympl==0.9.7
195
+ - ipython==9.0.2
196
+ - ipywidgets==8.1.5
197
+ - jinja2==3.1.5
198
+ - joblib==1.4.2
199
+ - json5==0.10.0
200
+ - jupyter-lsp==2.2.5
201
+ - jupyterlab==4.3.6
202
+ - jupyterlab-server==2.27.3
203
+ - jupyterlab-widgets==3.0.13
204
+ - kiwisolver==1.4.8
205
+ - llvmlite==0.44.0
206
+ - matplotlib==3.10.1
207
+ - mpi4py==4.0.3
208
+ - mpmath==1.3.0
209
+ - msgpack==1.1.0
210
+ - narwhals==1.42.1
211
+ - networkx==3.4.2
212
+ - ninja==1.11.1.3
213
+ - notebook==7.3.3
214
+ - notebook-shim==0.2.4
215
+ - numba==0.61.2
216
+ - numpy==2.2.3
217
+ - nvidia-cublas-cu12==12.4.5.8
218
+ - nvidia-cuda-cupti-cu12==12.4.127
219
+ - nvidia-cuda-nvrtc-cu12==12.4.127
220
+ - nvidia-cuda-runtime-cu12==12.4.127
221
+ - nvidia-cudnn-cu12==9.1.0.70
222
+ - nvidia-cufft-cu12==11.2.1.3
223
+ - nvidia-curand-cu12==10.3.5.147
224
+ - nvidia-cusolver-cu12==11.6.1.9
225
+ - nvidia-cusparse-cu12==12.3.1.170
226
+ - nvidia-cusparselt-cu12==0.6.2
227
+ - nvidia-ml-py==12.570.86
228
+ - nvidia-nccl-cu12==2.21.5
229
+ - nvidia-nvjitlink-cu12==12.4.127
230
+ - nvidia-nvtx-cu12==12.4.127
231
+ - pandas==2.3.0
232
+ - pandocfilters==1.5.1
233
+ - pillow==11.1.0
234
+ - plotly==6.1.2
235
+ - protobuf==5.29.3
236
+ - py-cpuinfo==9.0.0
237
+ - pydantic==2.10.6
238
+ - pydantic-core==2.27.2
239
+ - pynndescent==0.5.13
240
+ - pyparsing==3.2.1
241
+ - python-json-logger==3.3.0
242
+ - pytz==2025.2
243
+ - regex==2024.11.6
244
+ - rpds-py==0.23.1
245
+ - rtree==1.4.1
246
+ - safetensors==0.5.3
247
+ - scikit-learn==1.6.1
248
+ - scipy==1.15.2
249
+ - sentry-sdk==2.22.0
250
+ - setproctitle==1.3.5
251
+ - setuptools==75.8.2
252
+ - shapely==2.1.2
253
+ - soupsieve==2.6
254
+ - sympy==1.13.1
255
+ - threadpoolctl==3.6.0
256
+ - tokenizers==0.20.3
257
+ - torch==2.6.0
258
+ - torchaudio==2.6.0
259
+ - torchvision==0.21.0
260
+ - transformers==4.46.3
261
+ - trimesh==4.8.3
262
+ - triton==3.2.0
263
+ - tzdata==2025.2
264
+ - umap-learn==0.5.7
265
+ - wandb==0.18.3
266
+ - widgetsnbextension==4.0.13
267
+ prefix: /global/homes/b/binxia/.conda/envs/fsdp
example.ipynb ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "id": "07604227",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": []
10
+ },
11
+ {
12
+ "cell_type": "code",
13
+ "execution_count": null,
14
+ "id": "9882fd75",
15
+ "metadata": {},
16
+ "outputs": [],
17
+ "source": []
18
+ }
19
+ ],
20
+ "metadata": {
21
+ "kernelspec": {
22
+ "display_name": "fsdp",
23
+ "language": "python",
24
+ "name": "python3"
25
+ },
26
+ "language_info": {
27
+ "codemirror_mode": {
28
+ "name": "ipython",
29
+ "version": 3
30
+ },
31
+ "file_extension": ".py",
32
+ "mimetype": "text/x-python",
33
+ "name": "python",
34
+ "nbconvert_exporter": "python",
35
+ "pygments_lexer": "ipython3",
36
+ "version": "3.12.5"
37
+ }
38
+ },
39
+ "nbformat": 4,
40
+ "nbformat_minor": 5
41
+ }
foundation_bert.py ADDED
@@ -0,0 +1,300 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import os
3
+ import yaml
4
+ from pathlib import Path
5
+ from utils.masked_data_modeling_loss import MaskedDataLossWithSoftmax
6
+ # from ..utils.contrastive_loss import ContrastiveLoss
7
+ from utils.yaml_util import MyLoader
8
+ from dataclasses import dataclass
9
+ from transformers import BertModel, BertConfig, PretrainedConfig
10
+ from typing import Optional, Union
11
+
12
+
13
@dataclass
class FoundationOutput:
    """Bundle of tensors produced by a FoundationBert forward pass.

    Every field defaults to ``None`` so a caller may fill in only the
    outputs/losses relevant to the loss terms that are enabled.
    """

    loss: torch.Tensor = None            # combined loss (presumably the training objective — TODO confirm)
    logits: torch.Tensor = None          # token-level logits
    num_output: torch.Tensor = None      # numeric-head predictions
    est_err_output: torch.Tensor = None  # estimated-error predictions
    hidden_states: torch.Tensor = None   # encoder hidden states
    masked_loss: torch.Tensor = None     # masked-modeling loss component
    num_loss: torch.Tensor = None        # numeric-regression loss component
    est_err_loss: torch.Tensor = None    # estimated-error loss component
23
+
24
+
25
@dataclass
class FoundationBertConfig:
    """Configuration dataclass for :class:`FoundationBert`.

    Carries the standard BERT hyper-parameters plus the foundation-model
    extras (loss toggles, loss weights, contrastive temperature).
    """

    vocab_size: int
    hidden_size: int
    num_hidden_layers: int
    num_attention_heads: int
    intermediate_size: int
    hidden_dropout_prob: float
    attention_probs_dropout_prob: float
    pad_token_id: int
    classifier_dropout: float
    max_position_embeddings: int
    contrastive_temperature: float
    loss_weights: dict
    # Loss toggles; which combination is active may also be overridden by
    # the FoundationBert constructor when these attributes are absent.
    use_xval_loss: bool = True
    use_mlm_loss: bool = True
    use_regression_loss: bool = False
    use_contrastive_loss: bool = False
    transform_numeric: bool = False

    def to_dict(self):
        """Return a plain ``dict`` mapping every field name to its value."""
        # __dataclass_fields__ iterates its keys directly; values are not
        # deep-copied (matching the original shallow behavior).
        return {name: getattr(self, name) for name in self.__dataclass_fields__}
47
+
48
class FoundationBert(BertModel):
    """BERT backbone with per-modality embedding and regression heads.

    Each input modality (vector modalities such as ``SED``/``SFH``, or the
    pooled ``scalars`` group) is projected into the hidden space by its own
    ``Linear(1, H)`` embedding, the modalities are concatenated along the
    sequence axis, encoded jointly by the BERT encoder, and the encoded
    slices are decoded by per-modality regression heads.
    """

    def __init__(self,
                 config: FoundationBertConfig = None,
                 use_mlm_loss: bool = False,
                 use_regression_loss: bool = True,
                 use_contrastive_loss: bool = False,
                 use_xval_loss: bool = False,
                 transform_numeric: bool = False,
                 *args,
                 **kwargs):
        """Build the model.

        Parameters
        ----------
        config : FoundationBertConfig
            Foundation config; a subset is copied into the ``BertConfig``
            handed to the HF ``BertModel`` base class.
        use_mlm_loss, use_regression_loss, use_contrastive_loss, use_xval_loss : bool
            Fallback loss toggles, used only when ``config`` does not carry
            usable ``use_*`` attributes (see try/except below).
        transform_numeric : bool
            Stored onto ``config.transform_numeric``.
        **kwargs
            Must contain ``modalities`` and ``mask_token``; may contain
            ``dataset_path``.
        """
        # Keep the full foundation config; bert_conf only carries the
        # subset that transformers' BertModel understands.
        self.gconfig = config
        bert_conf = BertConfig(
            vocab_size=config.vocab_size,
            hidden_size=config.hidden_size,
            num_hidden_layers=config.num_hidden_layers,
            num_attention_heads=config.num_attention_heads,
            intermediate_size=config.intermediate_size,
            hidden_dropout_prob=config.hidden_dropout_prob,
            attention_probs_dropout_prob=config.attention_probs_dropout_prob,
            pad_token_id=config.pad_token_id,
            max_position_embeddings=config.max_position_embeddings,
            _attn_implementation='sdpa'
        )
        self.gconfig.transform_numeric = transform_numeric
        super().__init__(bert_conf,)
        # Prefer the loss flags on the config; fall back to the constructor
        # arguments when the config lacks them (AttributeError) or enables
        # no loss at all (the ValueError raised below).  The original code
        # used a bare ``except:``, which also swallowed KeyboardInterrupt
        # and unrelated bugs — narrowed here.
        try:
            if not self.gconfig.use_mlm_loss and not self.gconfig.use_regression_loss and not self.gconfig.use_contrastive_loss:
                raise ValueError("At least one loss must be enabled")
            self.loss_mod = float(self.gconfig.use_mlm_loss) + float(self.gconfig.use_regression_loss) + float(self.gconfig.use_contrastive_loss) + float(self.gconfig.use_xval_loss)
        except (AttributeError, ValueError):
            self.gconfig.use_mlm_loss = use_mlm_loss
            self.gconfig.use_regression_loss = use_regression_loss
            self.gconfig.use_contrastive_loss = use_contrastive_loss
            self.gconfig.use_xval_loss = use_xval_loss
            self.loss_mod = float(self.gconfig.use_mlm_loss) + float(self.gconfig.use_regression_loss) + float(self.gconfig.use_contrastive_loss) + float(self.gconfig.use_xval_loss)

        self.dataset_path = kwargs.get('dataset_path', None)

        # Required kwargs — raise KeyError early if the caller omits them.
        self.modalities = kwargs['modalities']
        self.mask_token = kwargs['mask_token']

        self.scalar_keys = [
            'redshift',
            'halo_mass',
            'stellar_mass',
        ]
        self.vector_keys = [
            'SED',
            'SFH',
            'mag_{band}_spherex',
            'mag_{band}_lsst',
        ]
        # Collapse every non-vector modality into one shared 'scalars'
        # entry, keeping first-seen order (dict.fromkeys dedupes stably).
        self.modalscalars = [m if m in self.vector_keys else 'scalars' for m in self.modalities]
        self.modalscalars = list(dict.fromkeys(self.modalscalars))

        self.embedding = torch.nn.ModuleDict()  # modality specific embedding layers
        self.num_head = torch.nn.ModuleDict()   # modality specific regression heads
        for modality in self.modalscalars:
            # input.shape -> output.shape: (B, L, 1) -> (B, L, H)
            self.embedding[modality] = torch.nn.Linear(1, config.hidden_size)
            self.num_head[modality] = torch.nn.Sequential(
                torch.nn.Linear(config.hidden_size, config.hidden_size),
                torch.nn.LayerNorm(config.hidden_size),
                torch.nn.GELU(),
                torch.nn.Linear(config.hidden_size, config.hidden_size // 2),
                torch.nn.GELU(),
                torch.nn.Linear(config.hidden_size // 2, 1)
            )

        self.position_embeddings = torch.nn.Embedding(config.max_position_embeddings, config.hidden_size)
        self.embed_dropout = torch.nn.Dropout(config.hidden_dropout_prob)

        self.lm_head = torch.nn.Linear(config.hidden_size, config.vocab_size, bias=False)  # isn't used currently
        self.xval_loss = torch.nn.MSELoss(reduction='none')  # isn't used currently
        self.mlm_loss = MaskedDataLossWithSoftmax(ignore=-100, reduction='none')  # isn't used currently
        self.distributed_loss = False

    @classmethod
    def from_pretrained(cls,
                        pretrained_model_name_or_path: Optional[Union[str, os.PathLike]],
                        *model_args,
                        config: Optional[Union[PretrainedConfig, str, os.PathLike]] = None,
                        cache_dir: Optional[Union[str, os.PathLike]] = None,
                        ignore_mismatched_sizes: bool = False,
                        force_download: bool = False,
                        local_files_only: bool = False,
                        token: Optional[Union[str, bool]] = None,
                        revision: str = "main",
                        use_safetensors: bool = None,
                        **kwargs,
                        ):
        """Load a checkpoint plus its YAML training config.

        Modification to correctly handle loading extraneous parameters for
        GBert: reads ``train_config.yaml`` inside the checkpoint directory
        to recover the constructor kwargs (``modalities``, ``dataset_path``,
        ``mask_token``) that stock ``BertModel.from_pretrained`` does not
        know about, then delegates to the parent implementation.

        Note: the ``config`` parameter is accepted for signature
        compatibility but is overwritten by the YAML contents.
        """
        model_config = Path(pretrained_model_name_or_path) / 'train_config.yaml'
        with open(model_config, 'r') as f:
            config = yaml.load(f, Loader=MyLoader)
        kwargs['modalities'] = config['modalities']
        kwargs['dataset_path'] = config['dataset_path']
        kwargs['mask_token'] = config['mask_token']
        return super().from_pretrained(
            pretrained_model_name_or_path,
            **config['model_config'],
            **kwargs
        )

    def pool_output(self,
                    embeddings: torch.Tensor,
                    attention_mask: torch.Tensor,
                    use_last: bool = False
                    ) -> torch.Tensor:
        """Average pool the hidden states using the attention mask.

        Parameters
        ----------
        embeddings : torch.Tensor
            The hidden states to pool (B, SeqLen, HiddenDim).
        attention_mask : torch.Tensor
            The attention mask for the hidden states (B, SeqLen).
        use_last : bool
            If True only the start token is masked out; otherwise both the
            start token and the final valid token are excluded.

        Returns
        -------
        torch.Tensor
            The pooled embeddings (B, HiddenDim).
        """
        sl_mod = 1 if use_last else 2
        # Per-sequence valid-token counts.
        seq_lengths = attention_mask.sum(axis=1)
        # Mask out the start token and (unless use_last) the end token.
        new_attention = attention_mask.clone()
        new_attention[:, 0] = 0
        # BUGFIX: the original wrote ``new_attention[:, seq_lengths - sl_mod] = 0``,
        # which indexes dim 1 with a length-B tensor and therefore zeroes
        # those columns for EVERY row in the batch.  Index per-row instead.
        batch_idx = torch.arange(attention_mask.shape[0], device=attention_mask.device)
        new_attention[batch_idx, seq_lengths - sl_mod] = 0

        # Broadcast the mask over the hidden dimension: (B, SeqLen, HiddenDim).
        pool_mask = new_attention.unsqueeze(-1).expand(embeddings.shape).to(embeddings.device)
        # Sum over the sequence length, skipping pad/start/stop tokens.
        sum_embeds = torch.sum(embeddings * pool_mask, 1)
        # Clamp to avoid division by zero for zero-length sequences.
        # NOTE(review): the denominator is the full valid length even though
        # the start/end tokens were excluded from the sum — confirm intended.
        seq_lengths = torch.clamp(seq_lengths, min=1).unsqueeze(-1)  # (B, 1) to broadcast
        return sum_embeds / seq_lengths

    def last_token_pool(
        self,
        embeddings: torch.Tensor,
        attention_mask: torch.Tensor,
    ) -> torch.Tensor:
        """Pool the last hidden states using the attention mask.

        Parameters
        ----------
        embeddings : torch.Tensor
            The last hidden states to pool (B, SeqLen, HiddenDim).
        attention_mask : torch.Tensor
            The attention mask for the hidden states (B, SeqLen).

        Returns
        -------
        torch.Tensor
            The pooled embeddings (B, HiddenDim).
        """
        # If every sequence attends at the final position, padding must be
        # on the left, so the last column is the last real token for all rows.
        left_padding = attention_mask[:, -1].sum() == attention_mask.shape[0]
        if left_padding:
            return embeddings[:, -1]
        # Right padding: pick each row's final valid position individually.
        sequence_lengths = attention_mask.sum(dim=1) - 1
        batch_size = embeddings.shape[0]
        return embeddings[
            torch.arange(batch_size, device=embeddings.device),
            sequence_lengths,
        ]

    def forward(self, inputs, return_input_label_mapping=False):
        """
        Forward pass that computes predictions for each modality.

        Args:
            inputs (dict): Holds per-modality tensors keyed
                ``input_{modality}`` and ``labels_{modality}`` for every
                entry of ``self.modalscalars``.
            return_input_label_mapping (bool): When True also return the
                per-modality input/label dict built here.

        Returns:
            outputs (dict): ``{modality}_logits`` regression-head outputs
            per modality (plus the mapping when requested).
        """
        input_label_mapping = {}
        combined = []
        for src_modality in self.modalscalars:
            # Record the modality's raw input and its mask labels.
            input_label_mapping[src_modality] = {
                'input': inputs[f"input_{src_modality}"],   # Input data
                'labels': inputs[f"labels_{src_modality}"]  # Corresponding labels
            }

            input_data = input_label_mapping[src_modality]['input']
            label = input_label_mapping[src_modality]['labels']
            # Replace masked positions (labels truthy) with the mask token.
            input_data = torch.where(label, self.mask_token, input_data)

            x = self.embedding[src_modality](input_data.unsqueeze(-1))  # (B, L, H)
            x = torch.nn.functional.silu(x)
            combined.append(x)

        # Concatenate all modalities along the sequence-length dimension.
        combined = torch.cat(combined, dim=1)

        # Learned absolute position embeddings over the combined sequence.
        # (Stored on self to preserve the original attribute side effect.)
        self.position_ids = torch.arange(combined.size(1)).unsqueeze(0).to(combined.device)  # (1, L)
        combined += self.position_embeddings(self.position_ids)
        combined = self.embed_dropout(combined)

        # Jointly encode the combined multi-modality sequence.
        x = self.encoder(combined, output_hidden_states=True).last_hidden_state

        start = 0
        outputs = {}
        # Slice the encoded sequence back into modality segments and decode.
        for tgt_modality in self.modalscalars:
            length = input_label_mapping[tgt_modality]['input'].shape[1]
            x_t = x[:, start:start + length, :]
            outputs[f"{tgt_modality}_logits"] = self.num_head[tgt_modality](x_t)
            start += length

        if getattr(self, 'save_umap_for', None):
            # NOTE(review): x_t is the slice from the LAST loop iteration, so
            # only the final modality is pooled for UMAP — confirm intended.
            pooled = x_t.mean(dim=1)  # mean pooling over sequence length
            self.save_pooled_embedding(pooled)  # saved for UMAP visualization

        return (outputs, input_label_mapping) if return_input_label_mapping else outputs

    def save_pooled_embedding(self, features):
        """
        Append pooled embeddings to the HDF5 file at ``self.save_umap_for``.

        Creates the file (with a resizable 'features' dataset) on first
        call; subsequent calls grow the dataset and append rows.
        """
        import h5py
        fname = Path(self.save_umap_for)
        fname.parent.mkdir(parents=True, exist_ok=True)

        features = features.detach().cpu().numpy()

        if fname.exists():
            with h5py.File(fname, 'r+') as f:
                old_size = f['features'].shape[0]            # current row count
                new_size = old_size + features.shape[0]      # after append
                f['features'].resize((new_size, features.shape[-1]))
                f['features'][old_size:] = features
        else:
            with h5py.File(fname, 'w') as f:
                # maxshape=(None, ...) keeps the row axis growable.
                f.create_dataset('features', data=features, maxshape=(None, features.shape[-1]), chunks=True)
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d2acc57f8c67e0f2b358632241243752031c23a7ed7030ba95c33b7f81e06c62
3
+ size 550172096
requirements.txt ADDED
@@ -0,0 +1,211 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ accelerate==1.1.1
2
+ alphashape==1.3.1
3
+ annotated-types==0.7.0
4
+ anyio @ file:///home/conda/feedstock_root/build_artifacts/bld/rattler-build_anyio_1742243108/work
5
+ appdirs==1.4.4
6
+ archspec @ file:///home/conda/feedstock_root/build_artifacts/archspec_1708969572489/work
7
+ argon2-cffi @ file:///home/conda/feedstock_root/build_artifacts/argon2-cffi_1733311059102/work
8
+ argon2-cffi-bindings @ file:///home/conda/feedstock_root/build_artifacts/argon2-cffi-bindings_1725356585055/work
9
+ arrow @ file:///home/conda/feedstock_root/build_artifacts/arrow_1733584251875/work
10
+ asttokens @ file:///home/conda/feedstock_root/build_artifacts/asttokens_1733250440834/work
11
+ async-lru==2.0.5
12
+ attrs @ file:///home/conda/feedstock_root/build_artifacts/attrs_1741918516150/work
13
+ babel==2.17.0
14
+ beautifulsoup4 @ file:///home/conda/feedstock_root/build_artifacts/beautifulsoup4_1738740337718/work
15
+ bleach @ file:///home/conda/feedstock_root/build_artifacts/bld/rattler-build_bleach_1737382993/work
16
+ boltons @ file:///home/conda/feedstock_root/build_artifacts/boltons_1711936407380/work
17
+ brokenaxes==0.6.2
18
+ Brotli @ file:///home/conda/feedstock_root/build_artifacts/brotli-split_1725267488082/work
19
+ cached-property @ file:///home/conda/feedstock_root/build_artifacts/cached_property_1615209429212/work
20
+ certifi @ file:///home/conda/feedstock_root/build_artifacts/certifi_1746569525376/work/certifi
21
+ cffi @ file:///home/conda/feedstock_root/build_artifacts/cffi_1724956320552/work
22
+ charset-normalizer @ file:///home/conda/feedstock_root/build_artifacts/charset-normalizer_1698833585322/work
23
+ # Editable install with no version control (chatarena==0.1.8)
24
+ -e /pscratch/sd/b/binxia/Werewolf
25
+ click==8.1.8
26
+ click-log==0.4.0
27
+ colorama @ file:///home/conda/feedstock_root/build_artifacts/colorama_1666700638685/work
28
+ comm @ file:///home/conda/feedstock_root/build_artifacts/comm_1733502965406/work
29
+ conda-package-handling @ file:///home/conda/feedstock_root/build_artifacts/conda-package-handling_1717678605937/work
30
+ conda_package_streaming @ file:///home/conda/feedstock_root/build_artifacts/conda-package-streaming_1717678526951/work
31
+ configparser==7.2.0
32
+ contourpy==1.3.1
33
+ cycler==0.12.1
34
+ debugpy @ file:///home/conda/feedstock_root/build_artifacts/debugpy_1741148399929/work
35
+ decorator @ file:///home/conda/feedstock_root/build_artifacts/decorator_1740384970518/work
36
+ defusedxml @ file:///home/conda/feedstock_root/build_artifacts/defusedxml_1615232257335/work
37
+ descartes==1.1.0
38
+ distro @ file:///home/conda/feedstock_root/build_artifacts/distro_1704321475663/work
39
+ docker-pycreds==0.4.0
40
+ einops==0.8.0
41
+ exceptiongroup @ file:///home/conda/feedstock_root/build_artifacts/exceptiongroup_1733208806608/work
42
+ executing==2.2.0
43
+ fastjsonschema @ file:///home/conda/feedstock_root/build_artifacts/python-fastjsonschema_1733235979760/work/dist
44
+ filelock==3.17.0
45
+ fonttools==4.56.0
46
+ fqdn @ file:///home/conda/feedstock_root/build_artifacts/fqdn_1733327382592/work/dist
47
+ frozendict @ file:///home/conda/feedstock_root/build_artifacts/frozendict_1715092752354/work
48
+ fsspec==2025.2.0
49
+ gitdb @ file:///home/conda/feedstock_root/build_artifacts/gitdb_1735887193964/work
50
+ GitPython @ file:///home/conda/feedstock_root/build_artifacts/gitpython_1735929639977/work
51
+ h11==0.14.0
52
+ h2 @ file:///home/conda/feedstock_root/build_artifacts/h2_1634280454336/work
53
+ hjson==3.1.0
54
+ hpack==4.0.0
55
+ httpcore==1.0.7
56
+ httpx==0.28.1
57
+ huggingface-hub==0.29.1
58
+ hyperframe @ file:///home/conda/feedstock_root/build_artifacts/hyperframe_1619110129307/work
59
+ idna @ file:///home/conda/feedstock_root/build_artifacts/idna_1724450538981/work
60
+ ijson==3.4.0
61
+ importlib_metadata @ file:///home/conda/feedstock_root/build_artifacts/importlib-metadata_1737420181517/work
62
+ importlib_resources @ file:///home/conda/feedstock_root/build_artifacts/importlib_resources_1736252299705/work
63
+ ipykernel @ file:///home/conda/feedstock_root/build_artifacts/ipykernel_1719845459717/work
64
+ ipympl==0.9.7
65
+ ipython==9.0.2
66
+ ipython_pygments_lexers @ file:///home/conda/feedstock_root/build_artifacts/ipython_pygments_lexers_1737123620466/work
67
+ ipywidgets==8.1.5
68
+ isoduration @ file:///home/conda/feedstock_root/build_artifacts/isoduration_1733493628631/work/dist
69
+ jedi @ file:///home/conda/feedstock_root/build_artifacts/jedi_1733300866624/work
70
+ Jinja2==3.1.5
71
+ joblib==1.4.2
72
+ json5==0.10.0
73
+ jsonpatch @ file:///home/conda/feedstock_root/build_artifacts/jsonpatch_1695536281965/work
74
+ jsonpointer @ file:///home/conda/feedstock_root/build_artifacts/jsonpointer_1725302935093/work
75
+ jsonschema @ file:///home/conda/feedstock_root/build_artifacts/jsonschema_1733472696581/work
76
+ jsonschema-specifications @ file:///tmp/tmpk0f344m9/src
77
+ jupyter-events @ file:///home/conda/feedstock_root/build_artifacts/bld/rattler-build_jupyter_events_1738765986/work
78
+ jupyter-lsp==2.2.5
79
+ jupyter_client @ file:///home/conda/feedstock_root/build_artifacts/jupyter_client_1733440914442/work
80
+ jupyter_core @ file:///home/conda/feedstock_root/build_artifacts/jupyter_core_1727163409502/work
81
+ jupyter_server @ file:///home/conda/feedstock_root/build_artifacts/jupyter_server_1734702637701/work
82
+ jupyter_server_mathjax @ file:///home/conda/feedstock_root/build_artifacts/jupyter-server-mathjax_1734509714511/work
83
+ jupyter_server_terminals @ file:///home/conda/feedstock_root/build_artifacts/jupyter_server_terminals_1733427956852/work
84
+ jupyterlab==4.3.6
85
+ jupyterlab_pygments @ file:///home/conda/feedstock_root/build_artifacts/jupyterlab_pygments_1733328101776/work
86
+ jupyterlab_server==2.27.3
87
+ jupyterlab_widgets==3.0.13
88
+ kiwisolver==1.4.8
89
+ kymatio==0.3.0
90
+ libmambapy @ file:///home/conda/feedstock_root/build_artifacts/mamba-split_1727883551957/work/libmambapy
91
+ llvmlite==0.44.0
92
+ MarkupSafe @ file:///home/conda/feedstock_root/build_artifacts/markupsafe_1733219680183/work
93
+ matplotlib==3.10.1
94
+ matplotlib-inline @ file:///home/conda/feedstock_root/build_artifacts/matplotlib-inline_1733416936468/work
95
+ menuinst @ file:///home/conda/feedstock_root/build_artifacts/menuinst_1725359038078/work
96
+ mistune @ file:///home/conda/feedstock_root/build_artifacts/bld/rattler-build_mistune_1742402716/work
97
+ mpi4py==4.0.3
98
+ mpmath==1.3.0
99
+ msgpack==1.1.0
100
+ narwhals==1.42.1
101
+ nbclient @ file:///home/conda/feedstock_root/build_artifacts/nbclient_1734628800805/work
102
+ nbconvert @ file:///home/conda/feedstock_root/build_artifacts/bld/rattler-build_nbconvert-core_1738067871/work
103
+ nbdime @ file:///home/conda/feedstock_root/build_artifacts/nbdime_1734533951497/work
104
+ nbformat @ file:///home/conda/feedstock_root/build_artifacts/nbformat_1733402752141/work
105
+ nersc-pymon==0.3.0
106
+ nest_asyncio @ file:///home/conda/feedstock_root/build_artifacts/nest-asyncio_1733325553580/work
107
+ networkx==3.4.2
108
+ ninja==1.11.1.3
109
+ notebook==7.3.3
110
+ notebook_shim==0.2.4
111
+ numba==0.61.2
112
+ numpy==2.2.3
113
+ nvidia-cublas-cu12==12.4.5.8
114
+ nvidia-cuda-cupti-cu12==12.4.127
115
+ nvidia-cuda-nvrtc-cu12==12.4.127
116
+ nvidia-cuda-runtime-cu12==12.4.127
117
+ nvidia-cudnn-cu12==9.1.0.70
118
+ nvidia-cufft-cu12==11.2.1.3
119
+ nvidia-curand-cu12==10.3.5.147
120
+ nvidia-cusolver-cu12==11.6.1.9
121
+ nvidia-cusparse-cu12==12.3.1.170
122
+ nvidia-cusparselt-cu12==0.6.2
123
+ nvidia-ml-py==12.570.86
124
+ nvidia-nccl-cu12==2.21.5
125
+ nvidia-nvjitlink-cu12==12.4.127
126
+ nvidia-nvtx-cu12==12.4.127
127
+ overrides @ file:///home/conda/feedstock_root/build_artifacts/overrides_1734587627321/work
128
+ packaging @ file:///home/conda/feedstock_root/build_artifacts/packaging_1718189413536/work
129
+ pandas==2.3.0
130
+ pandocfilters==1.5.1
131
+ parso @ file:///home/conda/feedstock_root/build_artifacts/parso_1733271261340/work
132
+ pexpect @ file:///home/conda/feedstock_root/build_artifacts/pexpect_1733301927746/work
133
+ pickleshare @ file:///home/conda/feedstock_root/build_artifacts/pickleshare_1733327343728/work
134
+ pillow==11.1.0
135
+ pkgutil_resolve_name @ file:///home/conda/feedstock_root/build_artifacts/pkgutil-resolve-name_1733344503739/work
136
+ platformdirs @ file:///home/conda/feedstock_root/build_artifacts/platformdirs_1715777629804/work
137
+ plotly==6.1.2
138
+ pluggy @ file:///home/conda/feedstock_root/build_artifacts/pluggy_1713667077545/work
139
+ prometheus_client @ file:///home/conda/feedstock_root/build_artifacts/prometheus_client_1733327310477/work
140
+ prompt_toolkit @ file:///home/conda/feedstock_root/build_artifacts/prompt-toolkit_1737453357274/work
141
+ protobuf==5.29.3
142
+ psutil @ file:///home/conda/feedstock_root/build_artifacts/psutil_1740663123172/work
143
+ ptyprocess @ file:///home/conda/feedstock_root/build_artifacts/ptyprocess_1733302279685/work/dist/ptyprocess-0.7.0-py2.py3-none-any.whl#sha256=92c32ff62b5fd8cf325bec5ab90d7be3d2a8ca8c8a3813ff487a8d2002630d1f
144
+ pure_eval @ file:///home/conda/feedstock_root/build_artifacts/pure_eval_1733569405015/work
145
+ py-cpuinfo==9.0.0
146
+ pycosat @ file:///home/conda/feedstock_root/build_artifacts/pycosat_1696355774225/work
147
+ pycparser @ file:///home/conda/feedstock_root/build_artifacts/pycparser_1711811537435/work
148
+ pydantic==2.10.6
149
+ pydantic_core==2.27.2
150
+ Pygments @ file:///home/conda/feedstock_root/build_artifacts/pygments_1736243443484/work
151
+ pynndescent==0.5.13
152
+ pyparsing==3.2.1
153
+ PySocks @ file:///home/conda/feedstock_root/build_artifacts/pysocks_1661604839144/work
154
+ python-dateutil @ file:///home/conda/feedstock_root/build_artifacts/python-dateutil_1733215673016/work
155
+ python-json-logger @ file:///home/conda/feedstock_root/build_artifacts/python-json-logger_1677079630776/work
156
+ pytz==2025.2
157
+ PyYAML @ file:///home/conda/feedstock_root/build_artifacts/pyyaml_1737454647378/work
158
+ pyzmq @ file:///home/conda/feedstock_root/build_artifacts/pyzmq_1738270962252/work
159
+ referencing @ file:///home/conda/feedstock_root/build_artifacts/bld/rattler-build_referencing_1737836872/work
160
+ regex==2024.11.6
161
+ requests @ file:///home/conda/feedstock_root/build_artifacts/requests_1717057054362/work
162
+ rfc3339_validator @ file:///home/conda/feedstock_root/build_artifacts/rfc3339-validator_1733599910982/work
163
+ rfc3986-validator @ file:///home/conda/feedstock_root/build_artifacts/rfc3986-validator_1598024191506/work
164
+ rpds-py==0.23.1
165
+ rtree==1.4.1
166
+ ruamel.yaml @ file:///home/conda/feedstock_root/build_artifacts/ruamel.yaml_1707298132558/work
167
+ ruamel.yaml.clib @ file:///home/conda/feedstock_root/build_artifacts/ruamel.yaml.clib_1707314473810/work
168
+ safetensors==0.5.3
169
+ scikit-learn==1.6.1
170
+ scipy==1.15.2
171
+ Send2Trash @ file:///home/conda/feedstock_root/build_artifacts/send2trash_1733322040660/work
172
+ sentry-sdk==2.22.0
173
+ setproctitle==1.3.5
174
+ setuptools==73.0.1
175
+ shapely==2.1.2
176
+ six @ file:///home/conda/feedstock_root/build_artifacts/six_1733380938961/work
177
+ smmap @ file:///home/conda/feedstock_root/build_artifacts/smmap_1739781697784/work
178
+ sniffio @ file:///home/conda/feedstock_root/build_artifacts/sniffio_1733244044561/work
179
+ soupsieve==2.6
180
+ stack_data @ file:///home/conda/feedstock_root/build_artifacts/stack_data_1733569443808/work
181
+ sympy==1.13.1
182
+ terminado @ file:///home/conda/feedstock_root/build_artifacts/terminado_1710262609923/work
183
+ threadpoolctl==3.6.0
184
+ tinycss2 @ file:///home/conda/feedstock_root/build_artifacts/tinycss2_1729802851396/work
185
+ tokenizers==0.20.3
186
+ torch==2.6.0
187
+ torchaudio==2.6.0
188
+ torchvision==0.21.0
189
+ tornado @ file:///home/conda/feedstock_root/build_artifacts/tornado_1732615905931/work
190
+ tqdm @ file:///home/conda/feedstock_root/build_artifacts/tqdm_1722737464726/work
191
+ traitlets @ file:///home/conda/feedstock_root/build_artifacts/traitlets_1733367359838/work
192
+ transformers==4.46.3
193
+ trimesh==4.8.3
194
+ triton==3.2.0
195
+ truststore @ file:///home/conda/feedstock_root/build_artifacts/truststore_1724770958874/work
196
+ types-python-dateutil @ file:///home/conda/feedstock_root/build_artifacts/types-python-dateutil_1733612335562/work
197
+ typing_extensions @ file:///home/conda/feedstock_root/build_artifacts/typing_extensions_1733188668063/work
198
+ typing_utils @ file:///home/conda/feedstock_root/build_artifacts/typing_utils_1733331286120/work
199
+ tzdata==2025.2
200
+ umap-learn==0.5.7
201
+ uri-template @ file:///home/conda/feedstock_root/build_artifacts/uri-template_1733323593477/work/dist
202
+ urllib3 @ file:///home/conda/feedstock_root/build_artifacts/urllib3_1719391292974/work
203
+ wandb==0.18.3
204
+ wcwidth @ file:///home/conda/feedstock_root/build_artifacts/wcwidth_1733231326287/work
205
+ webcolors @ file:///home/conda/feedstock_root/build_artifacts/webcolors_1733359735138/work
206
+ webencodings @ file:///home/conda/feedstock_root/build_artifacts/webencodings_1733236011802/work
207
+ websocket-client @ file:///home/conda/feedstock_root/build_artifacts/websocket-client_1733157342724/work
208
+ wheel==0.44.0
209
+ widgetsnbextension==4.0.13
210
+ zipp @ file:///home/conda/feedstock_root/build_artifacts/zipp_1732827521216/work
211
+ zstandard==0.23.0
setup.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from setuptools import find_packages, setup

# Runtime dependencies are tracked one-per-line in requirements.txt.
with open('requirements.txt', 'r') as f:
    requires = f.read().splitlines()

setup(
    name='object_foundations',
    # Version must be a string; the original passed the float 0.0.
    version='0.0',
    # NOTE(review): the original used `install_requirements=`, which is not a
    # setuptools keyword and is silently ignored — the dependencies were never
    # actually declared. The correct keyword is `install_requires`.
    install_requires=requires,
    packages=find_packages(exclude=["tests", "*.tests", "*.tests.*", "tests.*"]),
)
train_config.yaml ADDED
@@ -0,0 +1,213 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ dataset_path: /global/cfs/cdirs/m4717/azton/galaxy-foundations/object_foundation/utils/supermock_dataset_11.2-14.json
2
+ mask_token: 0
3
+ masked_generation: false
4
+ masking_prob:
5
+ - 0.2
6
+ - 0.2
7
+ - 0.2
8
+ - 0.2
9
+ - 0.5
10
+ - 0.5
11
+ - 0.5
12
+ modalities:
13
+ - SFH
14
+ - SED
15
+ - mag_{band}_lsst
16
+ - mag_{band}_spherex
17
+ - redshift
18
+ - halo_mass
19
+ - stellar_mass
20
+ model_config:
21
+ attention_probs_dropout_prob: 0.1
22
+ classifier_dropout: 0.0
23
+ contrastive_temperature: 0.05
24
+ hidden_dropout_prob: 0.1
25
+ hidden_size: 768
26
+ intermediate_size: 3072
27
+ loss_weights:
28
+ contrastive:
29
+ rounds: 0
30
+ w0T:
31
+ - 0
32
+ - 0
33
+ masked:
34
+ rounds: 0
35
+ w0T:
36
+ - 0.8
37
+ - 3
38
+ smooth:
39
+ rounds: 0
40
+ w0T:
41
+ - 0
42
+ - 0.3
43
+ unmasked:
44
+ rounds: 0
45
+ w0T:
46
+ - 0.2
47
+ - 0.3
48
+ max_position_embeddings: 1149
49
+ num_attention_heads: 12
50
+ num_hidden_layers: 18
51
+ pad_token_id: -1
52
+ transform_numeric: false
53
+ use_contrastive_loss: false
54
+ use_mlm_loss: true
55
+ use_regression_loss: false
56
+ use_xval_loss: false
57
+ vocab_size: 2048
58
+ model_name_or_path: galaxybert
59
+ tokenizer_name_or_path: Salesforce/SFR-Embedding-Mistral
60
+ training_args:
61
+ _n_gpu: 1
62
+ accelerator_config:
63
+ dispatch_batches: null
64
+ even_batches: true
65
+ gradient_accumulation_kwargs: null
66
+ non_blocking: false
67
+ split_batches: false
68
+ use_configured_state: false
69
+ use_seedable_sampler: true
70
+ adafactor: false
71
+ adam_beta1: 0.9
72
+ adam_beta2: 0.999
73
+ adam_epsilon: 1.0e-08
74
+ auto_find_batch_size: false
75
+ average_tokens_across_devices: false
76
+ batch_eval_metrics: false
77
+ bf16: true
78
+ bf16_full_eval: false
79
+ data_seed: null
80
+ dataloader_drop_last: false
81
+ dataloader_num_workers: 16
82
+ dataloader_persistent_workers: false
83
+ dataloader_pin_memory: true
84
+ dataloader_prefetch_factor: 8
85
+ ddp_backend: null
86
+ ddp_broadcast_buffers: null
87
+ ddp_bucket_cap_mb: null
88
+ ddp_find_unused_parameters: null
89
+ ddp_timeout: 1800
90
+ debug: []
91
+ deepspeed: null
92
+ disable_tqdm: false
93
+ dispatch_batches: null
94
+ do_eval: true
95
+ do_predict: false
96
+ do_train: false
97
+ eval_accumulation_steps: 5
98
+ eval_delay: 0
99
+ eval_do_concat_batches: true
100
+ eval_on_start: false
101
+ eval_steps: 20
102
+ eval_strategy: !!python/object/apply:transformers.trainer_utils.IntervalStrategy
103
+ - steps
104
+ eval_use_gather_object: false
105
+ evaluation_strategy: null
106
+ fp16: false
107
+ fp16_backend: auto
108
+ fp16_full_eval: false
109
+ fp16_opt_level: O1
110
+ fsdp: []
111
+ fsdp_config:
112
+ min_num_params: 0
113
+ xla: false
114
+ xla_fsdp_grad_ckpt: false
115
+ xla_fsdp_v2: false
116
+ fsdp_min_num_params: 0
117
+ fsdp_transformer_layer_cls_to_wrap: null
118
+ full_determinism: false
119
+ gradient_accumulation_steps: 5
120
+ gradient_checkpointing: false
121
+ gradient_checkpointing_kwargs: null
122
+ greater_is_better: null
123
+ group_by_length: false
124
+ half_precision_backend: auto
125
+ hub_always_push: false
126
+ hub_model_id: null
127
+ hub_private_repo: false
128
+ hub_strategy: !!python/object/apply:transformers.trainer_utils.HubStrategy
129
+ - every_save
130
+ hub_token: null
131
+ ignore_data_skip: false
132
+ include_for_metrics: []
133
+ include_inputs_for_metrics: false
134
+ include_num_input_tokens_seen: false
135
+ include_tokens_per_second: false
136
+ jit_mode_eval: false
137
+ label_names: null
138
+ label_smoothing_factor: 0.0
139
+ learning_rate: 0.0001
140
+ length_column_name: length
141
+ load_best_model_at_end: false
142
+ local_rank: 0
143
+ log_level: passive
144
+ log_level_replica: warning
145
+ log_on_each_node: true
146
+ logging_dir: sm_foundation_lg_gmm_nomasklab
147
+ logging_first_step: true
148
+ logging_nan_inf_filter: true
149
+ logging_steps: 1
150
+ logging_strategy: !!python/object/apply:transformers.trainer_utils.IntervalStrategy
151
+ - steps
152
+ lr_scheduler_kwargs: {}
153
+ lr_scheduler_type: !!python/object/apply:transformers.trainer_utils.SchedulerType
154
+ - cosine
155
+ max_grad_norm: 1.0
156
+ max_steps: -1
157
+ metric_for_best_model: null
158
+ mp_parameters: ''
159
+ neftune_noise_alpha: null
160
+ no_cuda: false
161
+ num_train_epochs: 60
162
+ optim: !!python/object/apply:transformers.training_args.OptimizerNames
163
+ - adamw_torch
164
+ optim_args: null
165
+ optim_target_modules: null
166
+ output_dir: supermock_te60_
167
+ overwrite_output_dir: true
168
+ past_index: -1
169
+ per_device_eval_batch_size: 100
170
+ per_device_train_batch_size: 100
171
+ per_gpu_eval_batch_size: null
172
+ per_gpu_train_batch_size: null
173
+ prediction_loss_only: false
174
+ push_to_hub: false
175
+ push_to_hub_model_id: null
176
+ push_to_hub_organization: null
177
+ push_to_hub_token: null
178
+ ray_scope: last
179
+ remove_unused_columns: false
180
+ report_to:
181
+ - wandb
182
+ restore_callback_states_from_checkpoint: false
183
+ resume_from_checkpoint: null
184
+ run_name: NO_SHARD_b50
185
+ save_on_each_node: false
186
+ save_only_model: false
187
+ save_safetensors: true
188
+ save_steps: 30
189
+ save_strategy: !!python/object/apply:transformers.trainer_utils.IntervalStrategy
190
+ - steps
191
+ save_total_limit: 360
192
+ seed: 42
193
+ skip_memory_metrics: true
194
+ split_batches: null
195
+ tf32: null
196
+ torch_compile: false
197
+ torch_compile_backend: null
198
+ torch_compile_mode: null
199
+ torch_empty_cache_steps: null
200
+ torchdynamo: null
201
+ tpu_metrics_debug: false
202
+ tpu_num_cores: null
203
+ use_cpu: false
204
+ use_ipex: false
205
+ use_legacy_prediction_loop: false
206
+ use_liger_kernel: false
207
+ use_mps_device: false
208
+ warmup_ratio: 0.0
209
+ warmup_steps: 0
210
+ weight_decay: 0.1
211
+ transform_numeric: false
212
+ wandb_project: supermock-foundation-perl
213
+ wandb_run_name: ''
utils/__init__.py ADDED
File without changes
utils/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (175 Bytes). View file
 
utils/__pycache__/masked_data_modeling_loss.cpython-312.pyc ADDED
Binary file (1.62 kB). View file
 
utils/__pycache__/yaml_util.cpython-312.pyc ADDED
Binary file (1.83 kB). View file
 
utils/masked_data_modeling_loss.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from einops import rearrange
3
+ '''
4
+ Simple class to do all MLM sort of loss operations in one place
5
+ '''
6
class MaskedDataLossWithSoftmax(torch.nn.Module):
    """Cross-entropy loss for masked-data (MLM-style) modeling.

    Wraps ``torch.nn.CrossEntropyLoss`` so callers can pass logits in the
    natural ``[batch, seq, vocab]`` layout; positions whose label equals
    ``ignore`` contribute nothing to the loss.
    """

    def __init__(self, ignore: int = -100, reduction: str = 'mean', weight=None):
        """
        ignore:    label value marking positions excluded from the loss.
                   (The original hard-coded -100 and silently discarded
                   this parameter — it is now passed through.)
        reduction: 'mean' | 'sum' | 'none', forwarded to CrossEntropyLoss.
        weight:    optional per-class rescaling tensor of shape [vocab_size].
        """
        super().__init__()
        self.loss = torch.nn.CrossEntropyLoss(ignore_index=ignore,
                                              reduction=reduction,
                                              weight=weight)

    # Define `forward` (not `__call__`) so nn.Module's call machinery and
    # hooks work normally; `instance(logits, labels)` still dispatches here.
    def forward(self, logits: torch.Tensor,
                labels: torch.Tensor) -> torch.Tensor:
        """
        logits: [batch_size, seq_len, vocab_size], pre-softmax.
        labels: [batch_size, seq_len]; entries equal to `ignore` are skipped.
        Returns the (reduced) cross-entropy loss tensor.
        """
        # CrossEntropyLoss expects the class dimension second: [b, v, s].
        # A plain transpose replaces the einops rearrange('b s v -> b v s').
        return self.loss(logits.transpose(1, 2), labels)
utils/yaml_util.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import yaml
2
+ class MyLoader(yaml.SafeLoader):
3
+ # returns
4
+ def construct_mapping(self, *args, **kwargs):
5
+ super().add_constructor(None, construct_undefined)
6
+ # when loading we want to skip keys that require construction,
7
+ mapping = super().construct_mapping(*args, **kwargs)
8
+
9
+ return mapping
10
+ import typing
11
+ class Tagged(typing.NamedTuple):
12
+ tag: str
13
+ value: object
14
+
15
+ def construct_undefined(self, node):
16
+ if isinstance(node, yaml.nodes.ScalarNode):
17
+ value = self.construct_scalar(node)
18
+ elif isinstance(node, yaml.nodes.SequenceNode):
19
+ value = self.construct_sequence(node)
20
+ elif isinstance(node, yaml.nodes.MappingNode):
21
+ value = self.construct_mapping(node)
22
+ else:
23
+ assert False, f"unexpected node: {node!r}"
24
+ return Tagged(node.tag, value)