ohadmo committed
Commit
d6f6cac
·
1 Parent(s): dc47c24

upload TE checkpoint

Files changed (5)
  1. LICENSE +178 -0
  2. README.md +167 -3
  3. config.json +35 -0
  4. geneformer.py +930 -0
  5. model.safetensors +3 -0
LICENSE ADDED
@@ -0,0 +1,178 @@
+
+ Apache License
+ Version 2.0, January 2004
+ http://www.apache.org/licenses/
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+ 1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+ 2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+ 3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+ 4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+ 6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+ 7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+ 8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+ 9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
+ END OF TERMS AND CONDITIONS
+
+ APPENDIX: How to apply the Apache License to your work.
+
+ To apply the Apache License to your work, attach the following
+ boilerplate notice, with the fields enclosed by brackets "[]"
+ replaced with your own identifying information. (Don't include
+ the brackets!) The text should be enclosed in the appropriate
+ comment syntax for the file format. We also recommend that a
+ file or class name and description of purpose be included on the
+ same "printed page" as the copyright notice for easier
+ identification within third-party archives.
+
+ Copyright 2022 Theodoris Lab, Gladstone Institute and The HuggingFace Inc. team. All rights reserved.
+ Copyright 2025 NVIDIA CORPORATION. All rights reserved.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
README.md CHANGED
@@ -1,3 +1,167 @@
- ---
- license: apache-2.0
- ---
+ ---
+ datasets: ctheodoris/Genecorpus-30M
+ library_name: transformers
+ license: apache-2.0
+ tags:
+ - single-cell
+ - genomics
+ ---
+
+ # Geneformer-10M (TransformerEngine-Optimized) Overview
+
+ ## Description:
+ Geneformer is a foundational transformer model pretrained on a large-scale corpus of single-cell transcriptomes to enable context-specific predictions in network biology, particularly in settings with limited data.
+
+ This version of the Geneformer model is optimized with NVIDIA's [TransformerEngine](https://github.com/NVIDIA/TransformerEngine) library. It is based on the original Geneformer V1 model and, within numerical precision, has identical weights and outputs.
+
+ This model is ready for commercial/non-commercial use.
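
As a quick sanity check, the checkpoint loads through the standard `transformers` auto classes. A minimal sketch, assuming `transformer_engine` is installed and a CUDA GPU is available; the random token IDs below are placeholders for the rank-value encodings produced by the Geneformer tokenizer:

```python
import torch
from transformers import AutoModelForMaskedLM

# trust_remote_code pulls in the TE-based architecture from geneformer.py in this repo
model = AutoModelForMaskedLM.from_pretrained(
    "nvidia/geneformer_V1_10M", trust_remote_code=True
).eval().cuda()

# Placeholder IDs; real inputs are rank-value encodings of length <= 2048
input_ids = torch.randint(1, model.config.vocab_size, (1, 2048), device="cuda")
attention_mask = torch.ones_like(input_ids)

with torch.no_grad():
    out = model(input_ids=input_ids, attention_mask=attention_mask)
print(out.logits.shape)  # torch.Size([1, 2048, 25426])
```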
+
+ ## Third-Party Community Consideration
+ This model is not owned or developed by NVIDIA. It has been developed and built to a third-party's requirements for this application and use case; see the Non-NVIDIA [Geneformer Model Card](https://huggingface.co/ctheodoris/Geneformer).
+
+ ### License/Terms of Use:
+ Geneformer is licensed under the [Apache 2.0 license](https://huggingface.co/datasets/choosealicense/licenses/blob/main/markdown/apache-2.0.md).
+
+ ### Deployment Geography:
+ Global
+
+ ### Use Case:
+ Network biology and therapeutic discovery, particularly in data-limited settings such as rare diseases or diseases affecting hard-to-access tissues.
+
+ ### Release Date:
+ Hugging Face 12/19/2025 via [https://huggingface.co/nvidia/geneformer_V1_10M](https://huggingface.co/nvidia/geneformer_V1_10M)
+
+ ## Reference(s):
+ * [Transfer learning enables predictions in network biology](https://www.nature.com/articles/s41586-023-06139-9.epdf?sharing_token=u_5LUGVkd3A8zR-f73lU59RgN0jAjWel9jnR3ZoTv0N2UB4yyXENUK50s6uqjXH69sDxh4Z3J4plYCKlVME-W2WSuRiS96vx6t5ex2-krVDS46JkoVvAvJyWtYXIyj74pDWn_DutZq1oAlDaxfvBpUfSKDdBPJ8SKlTId8uT47M%3D) - details of the original model trained on ~30 million transcriptomes in June 2021 and the initial report of the in silico perturbation and cell and gene classification strategies.
+ * [Quantized multi-task learning for context-specific representations of gene network dynamics](https://www.biorxiv.org/content/10.1101/2024.08.16.608180v1.full.pdf) - the expanded model, trained on ~104 million transcriptomes, and continual learning, multitask learning, and quantization strategies.
+ * See [geneformer.readthedocs.io](https://geneformer.readthedocs.io/) for documentation.
+
+ ## Model Architecture:
+ **Architecture Type:** Transformer
+ **Network Architecture:** BERT
+
+ **This model was developed based on:** [Geneformer](https://huggingface.co/ctheodoris/Geneformer) <br>
+ **Number of model parameters:** 1 x 10^7
+
+ ## Input:
+ **Input Type:** Number (each row represents a cell and contains gene names with single-cell expression counts) <br>
+ **Input Format:** Array [AnnData](https://anndata.readthedocs.io/en/latest/) <br>
+ **Input Parameters:** One-Dimensional (1D) <br>
+ **Other Properties Related to Input:** This model supports a context length of 2048.
+
+ ## Output:
+ **Output Type:** Dense Embedding Predictions <br>
+ **Output Format:** Vector <br>
+ **Output Parameters:** One-Dimensional (1D) <br>
+ **Other Properties Related to Output:** Numeric floating-point vector (fp16, bf16, or fp32); Geneformer-10M outputs 256-dimensional embeddings.
+
+ Our AI models are designed and/or optimized to run on NVIDIA GPU-accelerated systems. By leveraging NVIDIA's hardware (e.g. GPU cores) and software frameworks (e.g., CUDA libraries), the model achieves faster training and inference times compared to CPU-only solutions.
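
To illustrate the 256-dimensional embedding output, cell embeddings can be pooled from the encoder's last hidden state. A sketch reusing `input_ids`/`attention_mask` from the loading example above; mean pooling over non-padding tokens is one common choice, not the only one:

```python
import torch
from transformers import AutoModel

# AutoModel resolves to the encoder (geneformer.BertModel) via auto_map in config.json
encoder = AutoModel.from_pretrained(
    "nvidia/geneformer_V1_10M", trust_remote_code=True
).eval().cuda()

with torch.no_grad():
    hidden = encoder(input_ids=input_ids, attention_mask=attention_mask).last_hidden_state

# Mean-pool over non-padding positions -> one 256-dim embedding per cell
mask = attention_mask.unsqueeze(-1).to(hidden.dtype)  # [batch, seq, 1]
cell_emb = (hidden * mask).sum(dim=1) / mask.sum(dim=1)
print(cell_emb.shape)  # torch.Size([1, 256])
```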
+
+ ## Software Integration:
+ **Runtime Engine(s):**
+ * Transformer Engine
+ * PyTorch
+
+ **Supported Hardware Microarchitecture Compatibility:**
+ * A100
+ * H100
+ * H200
+ * GB200
+
+ **Preferred/Supported Operating System(s):**
+ * Linux
+
+ The integration of foundation and fine-tuned models into AI systems requires additional testing using use-case-specific data to ensure safe and effective deployment. Following the V-model methodology, iterative testing and validation at both unit and system levels are essential to mitigate risks, meet technical and functional requirements, and ensure compliance with safety and ethical standards before deployment.
+
+ ## Model Version(s):
+ * Geneformer-V1-10M
+ * Geneformer-V2-104M
+ * Geneformer-V2-316M
+ * Geneformer-V2-104M_CLcancer
+
+
+ ## Training and Evaluation Datasets:
+
+ ## Training Datasets:
+ **Link:** [Genecorpus-30M](https://huggingface.co/datasets/ctheodoris/Genecorpus-30M)
+
+ **Data Modality:**
+ * Text (Human single-cell transcriptomes)
+
+ **Text Training Data Size:**
+ * 1 Billion to 10 Trillion Tokens
+
+ **Data Collection Method by dataset:**
+ * Human
+
+ **Labeling Method by dataset:**
+ * N/A
+
+ **Properties:** The single-cell transcriptomes were assembled from a broad range of publicly available data sources. The researchers collected raw counts from sources like NCBI Gene Expression Omnibus (GEO), Human Cell Atlas, and Tumor Immune Single-cell Hub (TISCH), among others. They excluded cells with high mutational burdens, such as malignant cells and immortalized cell lines, and included only droplet-based sequencing platforms to ensure data comparability. The raw data was then converted into a uniform loom HDF5 file format.
+
+ ## Evaluation Datasets:
+ **Link:** [A cross-disorder dosage sensitivity map of the human genome](https://zenodo.org/records/6347673)
+
+ **Data Collection Method by dataset:**
+ * Human
+
+ **Labeling Method by dataset:**
+ * Not Applicable <!-- there are no labels for this dataset -->
+
+ **Properties:** The data was collected by harmonizing and meta-analyzing rare copy-number variants (rCNVs) from nearly one million individuals across 54 different disorders. This approach created a genome-wide catalog of dosage sensitivity.
+
+ **Link:** [Single-cell Transcriptome Analysis Reveals Dynamic Cell Populations and Differential Gene Expression Patterns in Control and Aneurysmal Human Aortic Tissue](https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE155468)
+
+ **Data Collection Method by dataset:**
+ * Human
+
+ **Labeling Method by dataset:**
+ * Human
+
+ **Properties:** The data was collected by performing single-cell RNA sequencing (scRNA-seq) on human ascending aortic tissues. Tissues were obtained from 11 study participants, consisting of 8 patients with ascending thoracic aortic aneurysm (ATAA) and 3 control subjects.
+
+ **Link:** [Systematic Comparison of High-throughput Single-Cell and Single-Nucleus Transcriptomes during Cardiomyocyte Differentiation](https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE129096)
+
+ **Data Collection Method by dataset:**
+ * Automated
+
+ **Labeling Method by dataset:**
+ * Human
+
+ **Properties:** The researchers used two different sequencing platforms to collect data from the same biological process: induced pluripotent stem cell (iPSC) differentiation into cardiomyocytes. The two platforms used were Drop-seq (single-cell) and DroNc-seq (single-nucleus). The study involved two iPSC lines and collected data over a 15-day time period.
+
+ **Link:** [A human cell atlas of fetal gene expression](https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE156793)
+
+ **Data Collection Method by dataset:**
+ * Human
+
+ **Labeling Method by dataset:**
+ * Hybrid: Human, Automated
+
+ **Properties:** The data was collected by profiling the gene expression of millions of single cells from 15 different human fetal organs.
+
+ **Link:** [Single-nuclei profiling of human dilated and hypertrophic cardiomyopathy](https://singlecell.broadinstitute.org/single_cell/study/SCP1303/single-nuclei-profiling-of-human-dilated-and-hypertrophic-cardiomyopathy#study-summary)
+
+ **Data Collection Method by dataset:**
+ * Human
+
+ **Labeling Method by dataset:**
+ * Hybrid: Human, Automated
+
+ **Properties:** The data was collected by performing single-nucleus RNA sequencing (snRNA-seq) on left ventricle samples from human hearts. The study included samples from 11 hearts with dilated cardiomyopathy, 15 hearts with hypertrophic cardiomyopathy, and 16 non-failing hearts. In total, nearly 600,000 nuclei were sequenced.
+
+ ## Inference:
+ **Acceleration Engine:** Transformer Engine, PyTorch
+
+ **Test Hardware:**
+ * A100
+ * H100
+ * H200
+ * GB200
+
+ ## Ethical Considerations:
+ NVIDIA believes Trustworthy AI is a shared responsibility and we have established policies and practices to enable development for a wide array of AI applications. When downloaded or used in accordance with our terms of service, developers should work with their internal model team to ensure this model meets requirements for the relevant industry and use case and addresses unforeseen product misuse.
+
+ Users are responsible for ensuring the physical properties of model-generated molecules are appropriately evaluated and comply with applicable safety regulations and ethical standards.
+
+ Please report model quality, risk, security vulnerabilities or NVIDIA AI Concerns [here](https://www.nvidia.com/en-us/support/submit-security-vulnerability/).
config.json ADDED
@@ -0,0 +1,35 @@
+ {
+   "_attn_implementation_autoset": true,
+   "architectures": [
+     "BertForMaskedLM"
+   ],
+   "attention_probs_dropout_prob": 0.02,
+   "auto_map": {
+     "AutoConfig": "geneformer.TEBertConfig",
+     "AutoModel": "geneformer.BertModel",
+     "AutoModelForMaskedLM": "geneformer.BertForMaskedLM"
+   },
+   "classifier_dropout": null,
+   "framework": "pytorch",
+   "fuse_qkv_params": true,
+   "gradient_checkpointing": false,
+   "hidden_act": "relu",
+   "hidden_dropout_prob": 0.02,
+   "hidden_size": 256,
+   "initializer_range": 0.02,
+   "intermediate_size": 512,
+   "layer_norm_eps": 1e-12,
+   "max_position_embeddings": 2048,
+   "micro_batch_size": null,
+   "model_type": "bert",
+   "num_attention_heads": 4,
+   "num_hidden_layers": 6,
+   "pad_token_id": 0,
+   "position_embedding_type": "absolute",
+   "torch_dtype": "float32",
+   "transformers_version": "4.51.3",
+   "type_vocab_size": 2,
+   "use_cache": true,
+   "use_te_layers": true,
+   "vocab_size": 25426
+ }
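
The `auto_map` block is what routes the `transformers` auto classes to the custom code in `geneformer.py`. A small sketch of that resolution (repo id taken from the README's release link):

```python
from transformers import AutoConfig

config = AutoConfig.from_pretrained("nvidia/geneformer_V1_10M", trust_remote_code=True)
print(type(config).__name__)  # TEBertConfig, per "AutoConfig": "geneformer.TEBertConfig"
print(config.use_te_layers)   # True -> BertEncoder will build TEBertLayer blocks
```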
geneformer.py ADDED
@@ -0,0 +1,930 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ # SPDX-License-Identifier: LicenseRef-Apache2
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ # coding=utf-8
+ # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
+ # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+
+ """PyTorch BERT model with and without Transformer Engine layers.
+
+ This file is a modified version of the BERT model from the Hugging Face Transformers library.
+ It provides a custom BERT encoder and a custom BERT layer, each of which can run with or
+ without Transformer Engine layers.
+ """
+
+ from typing import ClassVar, List, Optional, Tuple, Union
+
+ import torch
+ import transformer_engine.pytorch as te
+ from torch import nn
+ from torch.nn import CrossEntropyLoss
+ from transformers.modeling_attn_mask_utils import (
+     _prepare_4d_attention_mask_for_sdpa,
+     _prepare_4d_causal_attention_mask_for_sdpa,
+ )
+ from transformers.modeling_outputs import (
+     BaseModelOutputWithPastAndCrossAttentions,
+     BaseModelOutputWithPoolingAndCrossAttentions,
+     MaskedLMOutput,
+ )
+ from transformers.models.bert.configuration_bert import BertConfig
+ from transformers.models.bert.modeling_bert import (
+     BertEmbeddings,
+     BertLayer,
+     BertOnlyMLMHead,
+     BertPooler,
+     BertPreTrainedModel,
+ )
+ from transformers.utils import logging
+
+
+ logger = logging.get_logger(__name__)
+
+ _CHECKPOINT_FOR_DOC = "google-bert/bert-base-uncased"
+ _CONFIG_FOR_DOC = "BertConfig"
+
+
+ class TEBertConfig(BertConfig):
+     """Configuration class for the TE BERT model.
+
+     This class is a subclass of BertConfig, and it adds the following attributes:
+     - torch_dtype: The dtype of the model parameters.
+     - use_te_layers: Whether to use the TE layers.
+     - micro_batch_size: The micro batch size for TE layers.
+     - fuse_qkv_params: Whether to fuse the QKV projection parameters.
+     """
+
+     def __init__(self, **kwargs):
+         """Initialize the TEBertConfig.
+
+         Args:
+             **kwargs: Additional keyword arguments to pass to BertConfig.
+         """
+         super().__init__(**kwargs)
+         # TODO(@jomitchell): Fix this in JIRA BIONEMO-2406
+         torch_dtype = kwargs.get("torch_dtype", torch.bfloat16)
+         # Convert string dtype to torch dtype if needed
+         if isinstance(torch_dtype, str):
+             if torch_dtype == "bfloat16":
+                 torch_dtype = torch.bfloat16
+             elif torch_dtype == "float16":
+                 torch_dtype = torch.float16
+             elif torch_dtype == "float32":
+                 torch_dtype = torch.float32
+             else:
+                 raise ValueError(f"Unsupported dtype: {torch_dtype}")
+         self.torch_dtype = torch_dtype
+         self.use_te_layers = kwargs.get("use_te_layers", False)
+         self.micro_batch_size = kwargs.get("micro_batch_size", None)
+         self.fuse_qkv_params = kwargs.get("fuse_qkv_params", False)
+
+
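For illustration, a config equivalent to this repo's config.json can be constructed directly. A sketch, assuming this file is importable as `geneformer`; note how the string `torch_dtype` is converted to the torch dtype object by the code above:

```python
import torch

from geneformer import TEBertConfig  # assumes this file is on the import path

config = TEBertConfig(
    vocab_size=25426, hidden_size=256, num_hidden_layers=6, num_attention_heads=4,
    intermediate_size=512, hidden_act="relu", max_position_embeddings=2048,
    torch_dtype="float32", use_te_layers=True, fuse_qkv_params=True,
)
assert config.torch_dtype is torch.float32  # "float32" string was converted
```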
+ class TEBertLayer(nn.Module):
+     """Custom BERT layer using individual TE components for correct post-norm architecture.
+
+     This builds a BERT-style post-norm layer using:
+     - te.MultiheadAttention (with input_layernorm=False)
+     - te.LayerNorm for post-attention normalization, as layernorm
+     - te.Linear for MLP layers (fc1, fc2), wrapped in the layernorm_mlp module
+     - te.LayerNorm for post-MLP normalization, as layernorm_mlp.layer_norm
+
+     Parameter naming matches convert.py expectations for weight loading from HF checkpoints.
+
+     DIVERGENCE FROM TYPICAL TRANSFORMERLAYER:
+     This implementation uses POST-norm architecture, which differs significantly from the
+     typical TransformerLayer that uses PRE-norm.
+
+     Geneformer/HF BERT (POST-norm, output_layernorm=True equivalent):
+         Input -> Attention -> Dropout -> Residual Add -> LayerNorm
+         -> MLP -> Dropout -> Residual Add -> LayerNorm -> Output
+
+     Typical TransformerLayer (PRE-norm, output_layernorm=False default):
+         Input -> [LayerNorm Attn inside MultiheadAttention] -> Dropout -> Residual Add
+         -> [LayerNorm MLP inside LayerNormMLP] -> Dropout -> Residual Add -> Output
+
+     Geneformer applies LayerNorm AFTER residual connections as explicit separate modules,
+     whereas the typical TransformerLayer applies LayerNorm before operations via
+     input_layernorm=True inside the MultiheadAttention and LayerNormMLP modules.
+
+     For more information, see:
+     https://github.com/NVIDIA/TransformerEngine/blob/dd9433e7ad28c12f27da9770be54c9c584e85fa0/transformer_engine/pytorch/transformer.py#L822
+     """
+
+     def __init__(self, config, layer_number=None):
+         """Initialize the TEBertLayer.
+
+         Args:
+             config: Configuration object containing model parameters.
+             layer_number: Optional layer number for identification.
+         """
+         super().__init__()
+
+         self.hidden_size = config.hidden_size
+         self.num_attention_heads = config.num_attention_heads
+         self.layer_number = layer_number
+         self.is_decoder = config.is_decoder
+         self.add_cross_attention = config.add_cross_attention
+
+         # Self-attention using TE MultiheadAttention
+         self.self_attention = te.MultiheadAttention(
+             hidden_size=config.hidden_size,
+             num_attention_heads=config.num_attention_heads,
+             num_gqa_groups=config.num_attention_heads,
+             attention_dropout=config.attention_probs_dropout_prob,
+             input_layernorm=False,  # No LayerNorm before attention
+             attention_type="self",
+             layer_number=layer_number,
+             attn_mask_type="padding",
+             params_dtype=config.torch_dtype,
+             fuse_qkv_params=getattr(config, "fuse_qkv_params", False),
+             window_size=(-1, -1),  # No sliding window attention
+             qkv_format="bshd",  # BERT uses [batch, seq, head, dim]
+         )
+
+         # Post-attention TE LayerNorm
+         self.layernorm = te.LayerNorm(
+             normalized_shape=config.hidden_size,
+             eps=config.layer_norm_eps,
+             params_dtype=config.torch_dtype,
+         )
+
+         # MLP using TE Linear layers
+         self.layernorm_mlp = nn.Module()
+         self.layernorm_mlp.fc1 = te.Linear(
+             config.hidden_size,
+             config.intermediate_size,
+             bias=True,
+             params_dtype=config.torch_dtype,
+         )
+
+         if config.hidden_act != "relu":
+             raise ValueError(f"Geneformer requires hidden_act='relu', got '{config.hidden_act}'")
+         self.layernorm_mlp.activation = nn.ReLU()
+
+         self.layernorm_mlp.fc2 = te.Linear(
+             config.intermediate_size,
+             config.hidden_size,
+             bias=True,
+             params_dtype=config.torch_dtype,
+         )
+
+         # Post-MLP LayerNorm
+         self.layernorm_mlp.layer_norm = te.LayerNorm(
+             normalized_shape=config.hidden_size,
+             eps=config.layer_norm_eps,
+             params_dtype=config.torch_dtype,
+         )
+
+         # Dropout
+         self.attention_dropout = nn.Dropout(config.hidden_dropout_prob)
+         self.mlp_dropout = nn.Dropout(config.hidden_dropout_prob)
+
+     def forward(
+         self,
+         hidden_states: torch.Tensor,
+         attention_mask: Optional[torch.FloatTensor] = None,
+         head_mask: Optional[torch.FloatTensor] = None,
+         encoder_hidden_states: Optional[torch.FloatTensor] = None,
+         encoder_attention_mask: Optional[torch.FloatTensor] = None,
+         past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
+         output_attentions: Optional[bool] = False,
+     ) -> Tuple[torch.Tensor]:
+         """Forward pass through the TE BERT layer.
+
+         Architecture:
+             Input
+             -> Self-Attention
+             -> Dropout
+             -> Residual Connection
+             -> LayerNorm
+             -> MLP
+             -> Dropout
+             -> Residual Connection
+             -> LayerNorm
+             -> Output
+
+         This architecture is the key divergence from the typical TransformerLayer
+         (with output_layernorm=False default), which uses PRE-norm.
+
+         In a PRE-norm TransformerLayer, LayerNorm is applied before operations:
+         - MultiheadAttention with input_layernorm=True applies LayerNorm internally before attention
+         - LayerNormMLP applies LayerNorm internally before MLP
+         - Residuals bypass these internal LayerNorms
+
+         In Geneformer's POST-norm, LayerNorm is applied after residual connections as explicit
+         separate modules, meaning the normalized output flows to the next layer.
+
+         Args:
+             hidden_states: Input hidden states.
+             attention_mask: Attention mask.
+             head_mask: Head mask.
+             encoder_hidden_states: Encoder hidden states.
+             encoder_attention_mask: Encoder attention mask.
+             past_key_value: Past key value.
+             output_attentions: Whether to output attentions.
+
+         Returns:
+             Tuple of tensors containing the layer output.
+         """
+         # Attention mask handling for TE MultiheadAttention: [batch, 1, 1, seq_len], True=masked, False=attend
+         te_attention_mask = None
+         te_mask_type = "no_mask"
+
+         if attention_mask is not None:
+             # Check if there's actual padding (not all 1s for 2D or not all 0s for 4D)
+             if attention_mask.dim() == 2:
+                 # Standard [batch, seq_len] where 1=attend, 0=masked
+                 has_padding = not torch.all(attention_mask == 1)
+                 if has_padding:
+                     # Convert to TE format: [batch, 1, 1, seq_len], invert polarity
+                     te_attention_mask = ~attention_mask.bool().unsqueeze(1).unsqueeze(1)
+                     te_mask_type = "padding"
+             elif attention_mask.dim() in [3, 4]:
+                 # Extended mask with -inf for masked positions
+
+                 has_masking = torch.any(
+                     attention_mask < -10000.0
+                 )  # Check if it's not a trivial mask (all zeros/no masking)
+                 if has_masking:
+                     # Extract padding mask and convert to TE format
+                     if attention_mask.dim() == 4:
+                         padding_mask = attention_mask[:, 0, 0, :]  # [batch, seq_len]
+                     else:  # dim == 3
+                         padding_mask = attention_mask[:, 0, :]  # [batch, seq_len]
+                     # -inf to True (masked), 0 to False (attend)
+                     # Then reshape to [batch, 1, 1, seq_len]
+                     te_attention_mask = (padding_mask < -10000.0).unsqueeze(1).unsqueeze(1)
+                     te_mask_type = "padding"
+
+         # Self-Attention sub-layer
+         attention_output = self.self_attention(
+             hidden_states,
+             attention_mask=te_attention_mask,
+             attn_mask_type=te_mask_type,
+         )
+
+         # Residual connection + dropout + LayerNorm (POST-norm)
+         attention_output = self.attention_dropout(attention_output)
+         hidden_states = hidden_states + attention_output
+         hidden_states = self.layernorm(hidden_states)
+
+         # MLP sub-layer
+         mlp_output = self.layernorm_mlp.fc1(hidden_states)
+         mlp_output = self.layernorm_mlp.activation(mlp_output)
+         mlp_output = self.layernorm_mlp.fc2(mlp_output)
+
+         # Residual connection + dropout + LayerNorm (POST-norm)
+         mlp_output = self.mlp_dropout(mlp_output)
+         hidden_states = hidden_states + mlp_output
+         hidden_states = self.layernorm_mlp.layer_norm(hidden_states)
+
+         return (hidden_states,)
+
+
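A worked example of the 2D mask conversion in `forward` above, with toy values; this is the same inversion and reshaping the layer applies before calling `te.MultiheadAttention`:

```python
import torch

# HF-style 2D mask: 1 = attend, 0 = padding
hf_mask = torch.tensor([[1, 1, 1, 0]])

# TE-style mask: [batch, 1, 1, seq_len] with True = masked
te_mask = ~hf_mask.bool().unsqueeze(1).unsqueeze(1)
print(te_mask)        # tensor([[[[False, False, False,  True]]]])
print(te_mask.shape)  # torch.Size([1, 1, 1, 4])
```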
+ class BertEncoder(nn.Module):
+     """BERT encoder that stacks either TE layers (TEBertLayer) or standard HF layers (BertLayer)."""
+
+     def __init__(self, config):
+         super().__init__()
+         self.config = config
+         if self.config.use_te_layers:
+             self.layer = nn.ModuleList(
+                 [TEBertLayer(config, layer_number=i + 1) for i in range(config.num_hidden_layers)]
+             )
+         else:
+             self.layer = nn.ModuleList([BertLayer(config) for _ in range(config.num_hidden_layers)])
+         self.gradient_checkpointing = False
+
+     def _process_layer_outputs(
+         self,
+         layer_outputs,
+         hidden_states,
+         all_hidden_states,
+         all_self_attentions,
+         all_cross_attentions,
+         output_hidden_states,
+         output_attentions,
+         use_cache,
+         next_decoder_cache,
+     ):
+         """Process outputs from a single layer."""
+         hidden_states = layer_outputs[0]
+
+         if use_cache and next_decoder_cache is not None:
+             next_decoder_cache = (*next_decoder_cache, layer_outputs[-1])
+
+         if output_attentions and len(layer_outputs) > 1:
+             if all_self_attentions is None:
+                 all_self_attentions = (layer_outputs[1],)
+             else:
+                 all_self_attentions = (*all_self_attentions, layer_outputs[1])
+             if self.config.add_cross_attention and len(layer_outputs) > 2:
+                 if all_cross_attentions is None:
+                     all_cross_attentions = (layer_outputs[2],)
+                 else:
+                     all_cross_attentions = (*all_cross_attentions, layer_outputs[2])
+
+         return hidden_states, all_hidden_states, all_self_attentions, all_cross_attentions, next_decoder_cache
+
+     def forward(
+         self,
+         hidden_states: torch.Tensor,
+         attention_mask: Optional[torch.FloatTensor] = None,
+         head_mask: Optional[torch.FloatTensor] = None,
+         encoder_hidden_states: Optional[torch.FloatTensor] = None,
+         encoder_attention_mask: Optional[torch.FloatTensor] = None,
+         past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
+         use_cache: Optional[bool] = None,
+         output_attentions: Optional[bool] = False,
+         output_hidden_states: Optional[bool] = False,
+         return_dict: Optional[bool] = True,
+     ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]:
+         """Run the stacked layers, optionally collecting hidden states and attentions."""
+         all_hidden_states = () if output_hidden_states else None
+         all_self_attentions = () if output_attentions else None
+         all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None
+
+         if self.gradient_checkpointing and self.training:
+             if use_cache:
+                 logger.warning(
+                     "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
+                 )
+                 use_cache = False
+
+         next_decoder_cache = () if use_cache else None
+         for i, layer_module in enumerate(self.layer):
+             if output_hidden_states:
+                 if all_hidden_states is None:
+                     all_hidden_states = (hidden_states,)
+                 else:
+                     all_hidden_states = (*all_hidden_states, hidden_states)
+
+             layer_head_mask = head_mask[i] if head_mask is not None else None
+             past_key_value = past_key_values[i] if past_key_values is not None else None
+
+             if self.gradient_checkpointing and self.training:
+                 from torch.utils.checkpoint import checkpoint
+
+                 layer_outputs = checkpoint(
+                     layer_module,
+                     hidden_states,
+                     attention_mask,
+                     layer_head_mask,
+                     encoder_hidden_states,
+                     encoder_attention_mask,
+                     past_key_value,
+                     output_attentions,
+                     use_reentrant=False,
+                 )
+             else:
+                 layer_outputs = layer_module(
+                     hidden_states,
+                     attention_mask,
+                     layer_head_mask,
+                     encoder_hidden_states,
+                     encoder_attention_mask,
+                     past_key_value,
+                     output_attentions,
+                 )
+
+             hidden_states, all_hidden_states, all_self_attentions, all_cross_attentions, next_decoder_cache = (
+                 self._process_layer_outputs(
+                     layer_outputs,
+                     hidden_states,
+                     all_hidden_states,
+                     all_self_attentions,
+                     all_cross_attentions,
+                     output_hidden_states,
+                     output_attentions,
+                     use_cache,
+                     next_decoder_cache,
+                 )
+             )
+
+         if output_hidden_states:
+             if all_hidden_states is None:
+                 all_hidden_states = (hidden_states,)
+             else:
+                 all_hidden_states = (*all_hidden_states, hidden_states)
+
+         if not return_dict:
+             return tuple(
+                 v
+                 for v in [
+                     hidden_states,
+                     next_decoder_cache,
+                     all_hidden_states,
+                     all_self_attentions,
+                     all_cross_attentions,
+                 ]
+                 if v is not None
+             )
+         return BaseModelOutputWithPastAndCrossAttentions(
+             last_hidden_state=hidden_states,
+             past_key_values=next_decoder_cache,
+             hidden_states=all_hidden_states,
+             attentions=all_self_attentions,
+             cross_attentions=all_cross_attentions,
+         )
+
+
+ class BertModel(BertPreTrainedModel):
+     """BERT model for encoding and decoding.
+
+     The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
+     cross-attention is added between the self-attention layers, following the architecture described in [Attention is
+     all you need](https://arxiv.org/abs/1706.03762) by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit,
+     Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin.
+
+     To behave as a decoder the model needs to be initialized with the `is_decoder` argument of the configuration set
+     to `True`. To be used in a Seq2Seq model, the model needs to be initialized with both the `is_decoder` argument and
+     `add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass.
+     """
+
+     config_class = TEBertConfig
+
+     # TODO(@jomitchell) Can start swapping layers here for TE layers.
+     _no_split_modules: ClassVar[List[str]] = ["BertEmbeddings", "BertLayer", "TEBertLayer"]
+
+     def __init__(self, config, add_pooling_layer=True):
+         """Initialize the BertModel.
+
+         Args:
+             config: Configuration object containing model parameters.
+             add_pooling_layer: Whether to add a pooling layer on top of the encoder.
+         """
+         super().__init__(config)
+         self.config = config
+
+         self.embeddings = BertEmbeddings(config)
+         self.encoder = BertEncoder(config)
+
+         self.pooler = BertPooler(config) if add_pooling_layer else None
+
+         self.attn_implementation = config._attn_implementation
+         self.position_embedding_type = config.position_embedding_type
+
+         # Initialize weights and apply final processing
+         self.post_init()
+
+     def get_input_embeddings(self):
+         """Get the input embeddings."""
+         return self.embeddings.word_embeddings
+
+     def set_input_embeddings(self, value):
+         """Set the input embeddings."""
+         self.embeddings.word_embeddings = value
+
+     def _prune_heads(self, heads_to_prune):
+         """Prunes heads of the model.
+
+         heads_to_prune: dict of {layer_num: list of heads to prune in this layer}. See base
+         class PreTrainedModel.
+         """
+         for layer, heads in heads_to_prune.items():
+             self.encoder.layer[layer].attention.prune_heads(heads)
+
+     def _validate_and_prepare_inputs(
+         self,
+         input_ids,
+         inputs_embeds,
+         attention_mask,
+         token_type_ids,
+         position_ids,
+         past_key_values,
+     ):
+         """Validate inputs and prepare basic input data."""
+         if input_ids is not None and inputs_embeds is not None:
+             raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
+         elif input_ids is not None:
+             self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
+             input_shape = input_ids.size()
+         elif inputs_embeds is not None:
+             input_shape = inputs_embeds.size()[:-1]
+         else:
+             raise ValueError("You have to specify either input_ids or inputs_embeds")
+
+         batch_size, seq_length = input_shape
+         device = input_ids.device if input_ids is not None else inputs_embeds.device
+
+         # past_key_values_length
+         past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0
+
+         if token_type_ids is None:
+             if hasattr(self.embeddings, "token_type_ids"):
+                 buffered_token_type_ids = self.embeddings.token_type_ids[:, :seq_length]
+                 buffered_token_type_ids_expanded = buffered_token_type_ids.expand(batch_size, seq_length)
+                 token_type_ids = buffered_token_type_ids_expanded
+             else:
+                 token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)
+
+         embedding_output = self.embeddings(
+             input_ids=input_ids,
+             position_ids=position_ids,
+             token_type_ids=token_type_ids,
+             inputs_embeds=inputs_embeds,
+             past_key_values_length=past_key_values_length,
+         )
+
+         if attention_mask is None:
+             attention_mask = torch.ones((batch_size, seq_length + past_key_values_length), device=device)
+
+         return (
+             input_shape,
+             batch_size,
+             seq_length,
+             device,
+             past_key_values_length,
+             token_type_ids,
+             embedding_output,
+             attention_mask,
+         )
+
+     def _prepare_attention_masks(
+         self,
+         attention_mask,
+         input_shape,
+         embedding_output,
+         past_key_values_length,
+         seq_length,
+         device,
+         head_mask,
+         output_attentions,
+         encoder_hidden_states,
+         encoder_attention_mask,
+     ):
+         """Prepare attention masks for the forward pass."""
+         use_sdpa_attention_masks = (
+             self.attn_implementation == "sdpa"
+             and self.position_embedding_type == "absolute"
+             and head_mask is None
+             and not output_attentions
+         )
+
+         # Expand the attention mask
+         if use_sdpa_attention_masks and attention_mask.dim() == 2:
+             # Expand the attention mask for SDPA.
+             # [bsz, seq_len] -> [bsz, 1, seq_len, seq_len]
+             if self.config.is_decoder:
+                 extended_attention_mask = _prepare_4d_causal_attention_mask_for_sdpa(
+                     attention_mask,
+                     input_shape,
+                     embedding_output,
+                     past_key_values_length,
+                 )
+             else:
+                 extended_attention_mask = _prepare_4d_attention_mask_for_sdpa(
+                     attention_mask, embedding_output.dtype, tgt_len=seq_length
+                 )
+         else:
+             # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
+             # ourselves in which case we just need to make it broadcastable to all heads.
+             extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape)
+
+         # If a 2D or 3D attention mask is provided for the cross-attention
+         # we need to make it broadcastable to [batch_size, num_heads, seq_length, seq_length]
+         if self.config.is_decoder and encoder_hidden_states is not None:
+             encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
+             encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
+             if encoder_attention_mask is None:
+                 encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device)
+
+             if use_sdpa_attention_masks and encoder_attention_mask.dim() == 2:
+                 # Expand the attention mask for SDPA.
+                 # [bsz, seq_len] -> [bsz, 1, seq_len, seq_len]
+                 encoder_extended_attention_mask = _prepare_4d_attention_mask_for_sdpa(
+                     encoder_attention_mask, embedding_output.dtype, tgt_len=seq_length
+                 )
+             else:
+                 encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)
+         else:
+             encoder_extended_attention_mask = None
+
+         return extended_attention_mask, encoder_extended_attention_mask
+
+     def _prepare_inputs_and_masks(
+         self,
+         input_ids,
+         inputs_embeds,
+         attention_mask,
+         token_type_ids,
+         position_ids,
+         head_mask,
+         past_key_values,
+         encoder_hidden_states,
+         encoder_attention_mask,
+         output_attentions,
+         output_hidden_states,
+         use_cache,
+         return_dict,
+     ):
+         """Prepare inputs and attention masks for the forward pass."""
+         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+         output_hidden_states = (
+             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+         )
+         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+         if self.config.is_decoder:
+             use_cache = use_cache if use_cache is not None else self.config.use_cache
+         else:
+             use_cache = False
+
+         (
+             input_shape,
+             batch_size,
+             seq_length,
+             device,
+             past_key_values_length,
+             token_type_ids,
+             embedding_output,
+             attention_mask,
+         ) = self._validate_and_prepare_inputs(
+             input_ids,
+             inputs_embeds,
+             attention_mask,
+             token_type_ids,
+             position_ids,
+             past_key_values,
+         )
+
+         extended_attention_mask, encoder_extended_attention_mask = self._prepare_attention_masks(
+             attention_mask,
+             input_shape,
+             embedding_output,
+             past_key_values_length,
+             seq_length,
+             device,
+             head_mask,
+             output_attentions,
+             encoder_hidden_states,
+             encoder_attention_mask,
+         )
+
+         # Prepare head mask if needed
+         # 1.0 in head_mask indicates we keep the head
+         # attention_probs has shape bsz x n_heads x N x N
+         # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
+         # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
+         processed_head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
+
+         return (
+             embedding_output,
+             extended_attention_mask,
+             processed_head_mask,
+             encoder_extended_attention_mask,
+             use_cache,
+             return_dict,
+         )
+
+     def forward(
+         self,
+         input_ids: Optional[torch.Tensor] = None,
+         attention_mask: Optional[torch.Tensor] = None,
+         token_type_ids: Optional[torch.Tensor] = None,
+         position_ids: Optional[torch.Tensor] = None,
+         head_mask: Optional[torch.Tensor] = None,
+         inputs_embeds: Optional[torch.Tensor] = None,
+         encoder_hidden_states: Optional[torch.Tensor] = None,
+         encoder_attention_mask: Optional[torch.Tensor] = None,
+         past_key_values: Optional[List[torch.FloatTensor]] = None,
+         use_cache: Optional[bool] = None,
+         output_attentions: Optional[bool] = None,
+         output_hidden_states: Optional[bool] = None,
+         return_dict: Optional[bool] = None,
+     ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]:
+         r"""Forward pass of the BertModel.
+
+         Args:
+             input_ids (`torch.Tensor`, *optional*): Input token IDs.
+             attention_mask (`torch.Tensor`, *optional*): Attention mask.
+             token_type_ids (`torch.Tensor`, *optional*): Token type IDs.
+             position_ids (`torch.Tensor`, *optional*): Position IDs.
+             head_mask (`torch.Tensor`, *optional*): Head mask.
+             inputs_embeds (`torch.Tensor`, *optional*): Input embeddings.
+             encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+                 Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
+                 the model is configured as a decoder.
+             encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)` or `(batch_size, sequence_length, target_length)`, *optional*):
+                 Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
+                 the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:
+
+                 - 1 for tokens that are **not masked**,
+                 - 0 for tokens that are **masked**.
+             past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
+                 Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding (see
+                 `past_key_values`).
+             use_cache (`bool`, *optional*):
+                 If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
+                 `past_key_values`).
+             output_attentions (`bool`, *optional*): Whether to output attentions.
+             output_hidden_states (`bool`, *optional*): Whether to output hidden states.
+             return_dict (`bool`, *optional*): Whether to return a ModelOutput instead of a tuple.
+         """
+         (
+             embedding_output,
+             extended_attention_mask,
+             processed_head_mask,
+             encoder_extended_attention_mask,
+             use_cache,
+             return_dict,
+         ) = self._prepare_inputs_and_masks(
+             input_ids,
+             inputs_embeds,
+             attention_mask,
+             token_type_ids,
+             position_ids,
+             head_mask,
+             past_key_values,
+             encoder_hidden_states,
+             encoder_attention_mask,
+             output_attentions,
+             output_hidden_states,
+             use_cache,
+             return_dict,
+         )
+
+         encoder_outputs = self.encoder(
+             embedding_output,
+             attention_mask=extended_attention_mask,
+             head_mask=processed_head_mask,
+             encoder_hidden_states=encoder_hidden_states,
+             encoder_attention_mask=encoder_extended_attention_mask,
+             past_key_values=past_key_values,
+             use_cache=use_cache,
+             output_attentions=output_attentions,
+             output_hidden_states=output_hidden_states,
+             return_dict=return_dict,
+         )
+         sequence_output = encoder_outputs[0]
+         pooled_output = self.pooler(sequence_output) if self.pooler is not None else None
+
+         if not return_dict:
+             return (sequence_output, pooled_output, *encoder_outputs[1:])
+
+         return BaseModelOutputWithPoolingAndCrossAttentions(
+             last_hidden_state=sequence_output,
+             pooler_output=pooled_output,
+             past_key_values=encoder_outputs.past_key_values,
+             hidden_states=encoder_outputs.hidden_states,
+             attentions=encoder_outputs.attentions,
+             cross_attentions=encoder_outputs.cross_attentions,
+         )
+
+
+ class BertForMaskedLM(BertPreTrainedModel):
+     """BERT model for masked language modeling."""
+
+     config_class = TEBertConfig
+     _tied_weights_keys: ClassVar[List[str]] = ["predictions.decoder.bias", "cls.predictions.decoder.weight"]
+
+     def __init__(self, config):
+         """Initialize the BertForMaskedLM.
+
+         Args:
+             config: Configuration object containing model parameters.
+         """
+         super().__init__(config)
+
+         if config.is_decoder:
+             logger.warning(
+                 "If you want to use `BertForMaskedLM` make sure `config.is_decoder=False` for "
+                 "bi-directional self-attention."
+             )
+
+         self.bert = BertModel(config, add_pooling_layer=False)
+         self.cls = BertOnlyMLMHead(config)
+
+         # Initialize weights and apply final processing
+         self.post_init()
+
+     def get_output_embeddings(self):
+         """Get the output embeddings."""
+         return self.cls.predictions.decoder
+
+     def set_output_embeddings(self, new_embeddings):
+         """Set the output embeddings."""
+         self.cls.predictions.decoder = new_embeddings
+         self.cls.predictions.bias = new_embeddings.bias
+
+     def forward(
+         self,
+         input_ids: Optional[torch.Tensor] = None,
+         attention_mask: Optional[torch.Tensor] = None,
+         token_type_ids: Optional[torch.Tensor] = None,
+         position_ids: Optional[torch.Tensor] = None,
+         head_mask: Optional[torch.Tensor] = None,
+         inputs_embeds: Optional[torch.Tensor] = None,
+         encoder_hidden_states: Optional[torch.Tensor] = None,
+         encoder_attention_mask: Optional[torch.Tensor] = None,
+         labels: Optional[torch.Tensor] = None,
+         output_attentions: Optional[bool] = None,
+         output_hidden_states: Optional[bool] = None,
+         return_dict: Optional[bool] = None,
+     ) -> Union[Tuple[torch.Tensor], MaskedLMOutput]:
+         r"""Forward pass for masked language modeling.
+
+         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+             Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
+             config.vocab_size]` (see the `input_ids` docstring). Tokens with indices set to `-100` are ignored
+             (masked); the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+         """
+         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+         outputs = self.bert(
+             input_ids,
+             attention_mask=attention_mask,
+             token_type_ids=token_type_ids,
+             position_ids=position_ids,
+             head_mask=head_mask,
+             inputs_embeds=inputs_embeds,
+             encoder_hidden_states=encoder_hidden_states,
+             encoder_attention_mask=encoder_attention_mask,
+             output_attentions=output_attentions,
+             output_hidden_states=output_hidden_states,
+             return_dict=return_dict,
+         )
+
+         sequence_output = outputs[0]
+         prediction_scores = self.cls(sequence_output)
+
+         masked_lm_loss = None
+         if labels is not None:
+             loss_fct = CrossEntropyLoss()  # -100 index = padding token
+             masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
+
+         if not return_dict:
+             output = (prediction_scores, *outputs[2:])
+             return (masked_lm_loss, *output) if masked_lm_loss is not None else output
+
+         return MaskedLMOutput(
+             loss=masked_lm_loss,
+             logits=prediction_scores,
+             hidden_states=outputs.hidden_states,
+             attentions=outputs.attentions,
+         )
+
+     def prepare_inputs_for_generation(self, input_ids, attention_mask=None, **model_kwargs):
+         """Prepare inputs for generation."""
+         input_shape = input_ids.shape
+         effective_batch_size = input_shape[0]
+
+         # add a dummy token
+         if self.config.pad_token_id is None:
+             raise ValueError("The PAD token should be defined for generation")
+
+         attention_mask = torch.cat(
+             [attention_mask, attention_mask.new_zeros((attention_mask.shape[0], 1))],
+             dim=-1,
+         )
+         dummy_token = torch.full(
+             (effective_batch_size, 1),
+             self.config.pad_token_id,
+             dtype=torch.long,
+             device=input_ids.device,
+         )
+         input_ids = torch.cat([input_ids, dummy_token], dim=1)
+
+         return {"input_ids": input_ids, "attention_mask": attention_mask}
+
+     @classmethod
+     def can_generate(cls) -> bool:
+         """Legacy correction: BertForMaskedLM can't call `generate()` from `GenerationMixin`,
+         even though it has a `prepare_inputs_for_generation` method.
+         """
+         return False
+
+
+ __all__ = [
+     "BertForMaskedLM",
+     "BertLayer",
+     "BertModel",
+     "TEBertLayer",
+ ]
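
Tying the classes together, a sketch of the masked-language-modeling path with a randomly initialized model. It assumes a CUDA GPU with `transformer_engine` installed and that this file is importable as `geneformer`; `-100` labels are excluded from the loss, as described in the forward docstring:

```python
import torch

from geneformer import BertForMaskedLM, TEBertConfig

config = TEBertConfig(
    vocab_size=25426, hidden_size=256, num_hidden_layers=6, num_attention_heads=4,
    intermediate_size=512, hidden_act="relu", max_position_embeddings=2048,
    torch_dtype="float32", use_te_layers=True,
)
model = BertForMaskedLM(config).cuda().eval()  # TE layers expect a CUDA device

input_ids = torch.randint(1, config.vocab_size, (2, 16), device="cuda")
labels = input_ids.clone()
labels[:, :-2] = -100  # score only the last two positions of each sequence

with torch.no_grad():
    out = model(input_ids=input_ids, labels=labels)
print(out.loss, out.logits.shape)  # scalar loss, torch.Size([2, 16, 25426])
```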
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d0dfca398accc3b54ce8d9d1574bf92f393a9289426563cd68fdd2666dd74f09
+ size 67302248