thivy committed
Commit d83a67e · verified · Parent: 8723c4e

Training in progress, step 500, checkpoint

last-checkpoint/1_SpladePooling/config.json ADDED
@@ -0,0 +1,5 @@
+ {
+   "pooling_strategy": "max",
+   "activation_function": "relu",
+   "word_embedding_dimension": 51200
+ }
last-checkpoint/README.md ADDED
@@ -0,0 +1,554 @@
1
+ ---
2
+ language:
3
+ - 'no'
4
+ - da
5
+ - sv
6
+ license: mit
7
+ tags:
8
+ - sentence-transformers
9
+ - sparse-encoder
10
+ - sparse
11
+ - splade
12
+ - generated_from_trainer
13
+ - dataset_size:333547
14
+ - loss:SpladeLoss
15
+ - loss:SparseMultipleNegativesRankingLoss
16
+ - loss:FlopsLoss
17
+ base_model: ltg/norbert4-base
18
+ widget:
19
+ - text: "\n \nJeg begyndte at forstå, hvilke vældige kræfter min lille historie\
20
+ \ havde sluppet løs.\n \n "
21
+ - text: "\n \nIfølge Empires job-bibel skal en direktør-assistent ikke dække bord.\n\
22
+ \ \n "
23
+ - text: "\n \nDet kan du da ikke gøre!\n "
24
+ - text: "\n \nJeg må købe flere sherbet fountains.\n \n "
25
+ - text: Søren Kierkegaard, den danske filosof og teolog, var dybt fascineret af begrebet
26
+ tro. I sine mange skrifter udforskede han troens natur, dens paradokser og dens
27
+ betydning for det individuelle liv. Han anså troen for at være et ”spring i det
28
+ forlommede”, en akt af vilje der overstiger fornuften. I værker som ”Frygt og
29
+ Trekken” og ”Sygdommen til Døden” analyserede han troens relation til angst, desperation
30
+ og den eksistentielle krise. Kierkegaards tanker om tro har haft stor indflydelse
31
+ på kristen teologi og eksistentialisme.
32
+ pipeline_tag: feature-extraction
33
+ library_name: sentence-transformers
34
+ metrics:
35
+ - dot_accuracy@1
36
+ - dot_accuracy@3
37
+ - dot_accuracy@5
38
+ - dot_accuracy@10
39
+ - dot_precision@1
40
+ - dot_precision@3
41
+ - dot_precision@5
42
+ - dot_precision@10
43
+ - dot_recall@1
44
+ - dot_recall@3
45
+ - dot_recall@5
46
+ - dot_recall@10
47
+ - dot_ndcg@10
48
+ - dot_mrr@10
49
+ - dot_map@100
50
+ - query_active_dims
51
+ - query_sparsity_ratio
52
+ - corpus_active_dims
53
+ - corpus_sparsity_ratio
54
+ - avg_flops
55
+ model-index:
56
+ - name: Regular SPLADE NorBERT4-base — Retrieval-Only Training
57
+ results:
58
+ - task:
59
+ type: sparse-information-retrieval
60
+ name: Sparse Information Retrieval
61
+ dataset:
62
+ name: NanoNFCorpus
63
+ type: NanoNFCorpus
64
+ metrics:
65
+ - type: dot_accuracy@1
66
+ value: 0.02
67
+ name: Dot Accuracy@1
68
+ - type: dot_accuracy@3
69
+ value: 0.08
70
+ name: Dot Accuracy@3
71
+ - type: dot_accuracy@5
72
+ value: 0.08
73
+ name: Dot Accuracy@5
74
+ - type: dot_accuracy@10
75
+ value: 0.12
76
+ name: Dot Accuracy@10
77
+ - type: dot_precision@1
78
+ value: 0.02
79
+ name: Dot Precision@1
80
+ - type: dot_precision@3
81
+ value: 0.03333333333333333
82
+ name: Dot Precision@3
83
+ - type: dot_precision@5
84
+ value: 0.032
85
+ name: Dot Precision@5
86
+ - type: dot_precision@10
87
+ value: 0.026000000000000006
88
+ name: Dot Precision@10
89
+ - type: dot_recall@1
90
+ value: 7.905138339920947e-05
91
+ name: Dot Recall@1
92
+ - type: dot_recall@3
93
+ value: 0.003312410422185988
94
+ name: Dot Recall@3
95
+ - type: dot_recall@5
96
+ value: 0.004545769460972766
97
+ name: Dot Recall@5
98
+ - type: dot_recall@10
99
+ value: 0.006349071275176555
100
+ name: Dot Recall@10
101
+ - type: dot_ndcg@10
102
+ value: 0.027178706104522946
103
+ name: Dot Ndcg@10
104
+ - type: dot_mrr@10
105
+ value: 0.05088888888888889
106
+ name: Dot Mrr@10
107
+ - type: dot_map@100
108
+ value: 0.006747512755501429
109
+ name: Dot Map@100
110
+ - type: query_active_dims
111
+ value: 51200.0
112
+ name: Query Active Dims
113
+ - type: query_sparsity_ratio
114
+ value: 0.0
115
+ name: Query Sparsity Ratio
116
+ - type: corpus_active_dims
117
+ value: 51200.0
118
+ name: Corpus Active Dims
119
+ - type: corpus_sparsity_ratio
120
+ value: 0.0
121
+ name: Corpus Sparsity Ratio
122
+ - type: avg_flops
123
+ value: 51200.0
124
+ name: Avg Flops
125
+ ---
126
+
127
+ # Regular SPLADE NorBERT4-base — Retrieval-Only Training
128
+
129
+ This is a [SPLADE Sparse Encoder](https://www.sbert.net/docs/sparse_encoder/usage/usage.html) model finetuned from [ltg/norbert4-base](https://huggingface.co/ltg/norbert4-base) using the [sentence-transformers](https://www.SBERT.net) library. It maps sentences & paragraphs to a 51200-dimensional sparse vector space and can be used for semantic search and sparse retrieval.
130
+ ## Model Details
131
+
132
+ ### Model Description
133
+ - **Model Type:** SPLADE Sparse Encoder
134
+ - **Base model:** [ltg/norbert4-base](https://huggingface.co/ltg/norbert4-base) <!-- at revision f04e0e824de9ff9a08767727dc8891d38fddd032 -->
135
+ - **Maximum Sequence Length:** None (no explicit limit set)
136
+ - **Output Dimensionality:** 51200 dimensions
137
+ - **Similarity Function:** Dot Product
138
+ <!-- - **Training Dataset:** Unknown -->
139
+ - **Languages:** no, da, sv
140
+ - **License:** mit
141
+
142
+ ### Model Sources
143
+
144
+ - **Documentation:** [Sentence Transformers Documentation](https://sbert.net)
145
+ - **Documentation:** [Sparse Encoder Documentation](https://www.sbert.net/docs/sparse_encoder/usage/usage.html)
146
+ - **Repository:** [Sentence Transformers on GitHub](https://github.com/huggingface/sentence-transformers)
147
+ - **Hugging Face:** [Sparse Encoders on Hugging Face](https://huggingface.co/models?library=sentence-transformers&other=sparse-encoder)
148
+
149
+ ### Full Model Architecture
150
+
151
+ ```
152
+ SparseEncoder(
153
+ (0): MLMTransformer({'max_seq_length': None, 'do_lower_case': False, 'architecture': 'GptBertForMaskedLM'})
154
+ (1): SpladePooling({'pooling_strategy': 'max', 'activation_function': 'relu', 'word_embedding_dimension': 51200})
155
+ )
156
+ ```
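+ 
+ For intuition: the SpladePooling module above turns the MLM logits into one sparse vector per text by max-pooling log-saturated activations over the sequence. A minimal sketch of that computation (hand-written here for illustration, not the library's exact implementation):
+ 
+ ```python
+ import torch
+ 
+ def splade_pool(logits: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
+     """Collapse MLM logits [batch, seq_len, vocab] into sparse vectors [batch, vocab]."""
+     # log(1 + ReLU(x)): non-negative activations with dampened large logits
+     activations = torch.log1p(torch.relu(logits))
+     # Ignore padding positions before max-pooling over the sequence axis
+     activations = activations.masked_fill(~attention_mask.bool().unsqueeze(-1), 0.0)
+     return activations.max(dim=1).values
+ ```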
157
+
158
+ ## Usage
159
+
160
+ ### Direct Usage (Sentence Transformers)
161
+
162
+ First install the Sentence Transformers library:
163
+
164
+ ```bash
165
+ pip install -U sentence-transformers
166
+ ```
167
+
168
+ Then you can load this model and run inference.
169
+ ```python
170
+ from sentence_transformers import SparseEncoder
171
+
172
+ # Download from the 🤗 Hub
173
+ model = SparseEncoder("thivy/norbert4-base-splade-retrieval")
174
+ # Run inference
175
+ sentences = [
176
+ '\n \nJeg vil ikke ha noen innvendinger.\n \n ',
177
+ '\n \nJeg ville ikke have nogen indvendinger.\n \n ',
178
+ 'Søren Kierkegaard, den danske filosof og teolog, var dybt fascineret af begrebet tro. I sine mange skrifter udforskede han troens natur, dens paradokser og dens betydning for det individuelle liv. Han anså troen for at være et ”spring i det forlommede”, en akt af vilje der overstiger fornuften. I værker som ”Frygt og Trekken” og ”Sygdommen til Døden” analyserede han troens relation til angst, desperation og den eksistentielle krise. Kierkegaards tanker om tro har haft stor indflydelse på kristen teologi og eksistentialisme.',
179
+ ]
180
+ embeddings = model.encode(sentences)
181
+ print(embeddings.shape)
182
+ # [3, 51200]
183
+
184
+ # Get the similarity scores for the embeddings
185
+ similarities = model.similarity(embeddings, embeddings)
186
+ print(similarities)
187
+ # tensor([[ 8.0400, 6.6640, 6.9193],
188
+ # [ 6.6640, 10.4033, 9.1223],
189
+ # [ 6.9193, 9.1223, 20.8932]])
190
+ ```
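+ 
+ Because the vectors live in the vocabulary space, the active dimensions are interpretable as weighted tokens. A short follow-up sketch, assuming the `SparseEncoder.decode` helper available in recent sentence-transformers releases:
+ 
+ ```python
+ # Inspect the highest-weighted vocabulary dimensions of the first embedding
+ decoded = model.decode(embeddings[0], top_k=10)
+ for token, weight in decoded:
+     print(f"{token}: {weight:.2f}")
+ ```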
191
+
192
+ <!--
193
+ ### Direct Usage (Transformers)
194
+
195
+ <details><summary>Click to see the direct usage in Transformers</summary>
196
+
197
+ </details>
198
+ -->
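+ 
+ To load the underlying masked-LM checkpoint with plain `transformers` instead, note that the repository ships custom `configuration_gptbert`/`modeling_gptbert` code via `auto_map`, so `trust_remote_code=True` is required. A minimal sketch (repo ids as used elsewhere in this card):
+ 
+ ```python
+ from transformers import AutoModelForMaskedLM, AutoTokenizer
+ 
+ tokenizer = AutoTokenizer.from_pretrained("ltg/norbert4-base")
+ # The custom GptBert classes are resolved through auto_map at load time
+ model = AutoModelForMaskedLM.from_pretrained(
+     "thivy/norbert4-base-splade-retrieval", trust_remote_code=True
+ )
+ ```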
199
+
200
+ <!--
201
+ ### Downstream Usage (Sentence Transformers)
202
+
203
+ You can finetune this model on your own dataset.
204
+
205
+ <details><summary>Click to expand</summary>
206
+
207
+ </details>
208
+ -->
209
+
210
+ <!--
211
+ ### Out-of-Scope Use
212
+
213
+ *List how the model may foreseeably be misused and address what users ought not to do with the model.*
214
+ -->
215
+
216
+ ## Evaluation
217
+
218
+ ### Metrics
219
+
220
+ #### Sparse Information Retrieval
221
+
222
+ * Dataset: `NanoNFCorpus`
223
+ * Evaluated with [<code>SparseInformationRetrievalEvaluator</code>](https://sbert.net/docs/package_reference/sparse_encoder/evaluation.html#sentence_transformers.sparse_encoder.evaluation.SparseInformationRetrievalEvaluator)
224
+
225
+ | Metric | Value |
226
+ |:----------------------|:-----------|
227
+ | dot_accuracy@1 | 0.02 |
228
+ | dot_accuracy@3 | 0.08 |
229
+ | dot_accuracy@5 | 0.08 |
230
+ | dot_accuracy@10 | 0.12 |
231
+ | dot_precision@1 | 0.02 |
232
+ | dot_precision@3 | 0.0333 |
233
+ | dot_precision@5 | 0.032 |
234
+ | dot_precision@10 | 0.026 |
235
+ | dot_recall@1 | 0.0001 |
236
+ | dot_recall@3 | 0.0033 |
237
+ | dot_recall@5 | 0.0045 |
238
+ | dot_recall@10 | 0.0063 |
239
+ | **dot_ndcg@10** | **0.0272** |
240
+ | dot_mrr@10 | 0.0509 |
241
+ | dot_map@100 | 0.0067 |
242
+ | query_active_dims | 51200.0 |
243
+ | query_sparsity_ratio | 0.0 |
244
+ | corpus_active_dims | 51200.0 |
245
+ | corpus_sparsity_ratio | 0.0 |
246
+ | avg_flops | 51200.0 |
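+ 
+ Note that `query_sparsity_ratio` and `corpus_sparsity_ratio` are simply the fraction of inactive dimensions, so with all 51200 dimensions active at this checkpoint both ratios are 0.0 and the outputs are still effectively dense:
+ 
+ ```python
+ # sparsity_ratio = 1 - active_dims / output_dims
+ print(1 - 51200 / 51200)  # 0.0 -> no dimensions are inactive yet at step 500
+ ```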
247
+
248
+ <!--
249
+ ## Bias, Risks and Limitations
250
+
251
+ *What are the known or foreseeable issues stemming from this model? You could also flag here known failure cases or weaknesses of the model.*
252
+ -->
253
+
254
+ <!--
255
+ ### Recommendations
256
+
257
+ *What are recommendations with respect to the foreseeable issues? For example, filtering explicit content.*
258
+ -->
259
+
260
+ ## Training Details
261
+
262
+ ### Training Dataset
263
+
264
+ #### Unnamed Dataset
265
+
266
+ * Size: 333,547 training samples
267
+ * Columns: <code>anchor</code> and <code>positive</code>
268
+ * Approximate statistics based on the first 1000 samples:
269
+ | | anchor | positive |
270
+ |:--------|:-----------------------------------------------------------------------------------|:-------------------------------------------------------------------------------------|
271
+ | type | string | string |
272
+ | details | <ul><li>min: 3 tokens</li><li>mean: 22.81 tokens</li><li>max: 517 tokens</li></ul> | <ul><li>min: 1 tokens</li><li>mean: 406.29 tokens</li><li>max: 4096 tokens</li></ul> |
273
+ * Samples:
274
+ | anchor | positive |
275
+ |:---------------------------------------------------------------------|:-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
276
+ | <code><br>Hun er mye eldre enn henne.<br> <br> </code> | <code><br>Hun er meget ældre end hende.<br> <br> </code> |
277
+ | <code><br>Hva så? <br> <br>Du lå med kona mi!<br> <br> </code> | <code><br>Men du gik i seng med min kone.<br> <br> </code> |
278
+ | <code>Hur aktiverar jag en indeksfond?</code> | <code>Att investera i indexfonder är ett populärt sätt att exponera sig mot aktiemarknaden. Det är ett passivt investeringsalternativ där portföljen följer en specifik index, till exempel OMX Stockholm 30.<br><br>För att aktivera en indexfond behöver du ett depåkonto hos en bank eller en investmentsmäklare. Innan du påbörjar processen bör du noggrant undersöka och jämföra olika fonder för att hitta den som bäst passar dina investeringsmål och risktolerans.<br><br>När du väl har valt en fond kan du vanligtvis aktivera den online via bankens eller mäklarens plattform. Du behöver ange hur mycket du vill investera och godkänna villkoren. Därefter kommer fonden att köpas och lagts till i ditt depåkonto.<br><br>Det är viktigt att ha en långsiktig investeringshorisont när du investerar i indexfonder. Marknaderna fluktuerar i värde på kort sikt, men över tid har indexfonder historiskt sett genererat goda avkastningar.</code> |
279
+ * Loss: [<code>SpladeLoss</code>](https://sbert.net/docs/package_reference/sparse_encoder/losses.html#spladeloss) with these parameters:
280
+ ```json
281
+ {
282
+ "loss": "SparseMultipleNegativesRankingLoss(scale=1.0, similarity_fct='dot_score', gather_across_devices=False)",
283
+ "document_regularizer_weight": 0.003,
284
+ "query_regularizer_weight": 0.0001
285
+ }
286
+ ```
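+ 
+ The two regularizer weights scale the FLOPS penalties (see the FlopsLoss citation below) that push document and query vectors toward sparsity; documents are regularized more strongly (0.003) than queries (0.0001). A minimal sketch of the FLOPS term itself, not the library's exact code:
+ 
+ ```python
+ import torch
+ 
+ def flops_loss(embeddings: torch.Tensor) -> torch.Tensor:
+     """Sum over dimensions of the squared mean activation across the batch."""
+     # SPLADE activations are non-negative, so the mean equals the mean absolute value
+     return (embeddings.mean(dim=0) ** 2).sum()
+ ```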
287
+
288
+ ### Evaluation Dataset
289
+
290
+ #### Unnamed Dataset
291
+
292
+ * Size: 14,458 evaluation samples
293
+ * Columns: <code>anchor</code> and <code>positive</code>
294
+ * Approximate statistics based on the first 1000 samples:
295
+ | | anchor | positive |
296
+ |:--------|:----------------------------------------------------------------------------------|:-------------------------------------------------------------------------------------|
297
+ | type | string | string |
298
+ | details | <ul><li>min: 3 tokens</li><li>mean: 16.03 tokens</li><li>max: 86 tokens</li></ul> | <ul><li>min: 7 tokens</li><li>mean: 134.75 tokens</li><li>max: 4096 tokens</li></ul> |
299
+ * Samples:
300
+ | anchor | positive |
301
+ |:--------------------------------------------------------------------------------------------|:------------------------------------------------------------------------------------|
302
+ | <code><br> <br>Hva er det for organisasjon som skal ha årsmøte her?<br> <br> </code> | <code><br> <br>Hvilken organisation skal holde kongres her?<br> <br> </code> |
303
+ | <code><br>Livet ditt er jo ikke så verst.<br> <br> </code> | <code><br>Dit liv er ikke så slemt.<br> <br> </code> |
304
+ | <code><br> <br>Men du må ta deg av dem for meg, okay?<br> <br> </code> | <code><br> <br>Men du må tage dig af dem for mig, okay?<br> <br> </code> |
305
+ * Loss: [<code>SpladeLoss</code>](https://sbert.net/docs/package_reference/sparse_encoder/losses.html#spladeloss) with these parameters:
306
+ ```json
307
+ {
308
+ "loss": "SparseMultipleNegativesRankingLoss(scale=1.0, similarity_fct='dot_score', gather_across_devices=False)",
309
+ "document_regularizer_weight": 0.003,
310
+ "query_regularizer_weight": 0.0001
311
+ }
312
+ ```
313
+
314
+ ### Training Hyperparameters
315
+ #### Non-Default Hyperparameters
316
+
317
+ - `eval_strategy`: steps
318
+ - `per_device_train_batch_size`: 16
319
+ - `per_device_eval_batch_size`: 32
320
+ - `learning_rate`: 2e-05
321
+ - `weight_decay`: 0.01
322
+ - `num_train_epochs`: 1
323
+ - `warmup_ratio`: 0.1
324
+ - `bf16`: True
325
+ - `dataloader_num_workers`: 2
326
+ - `dataloader_prefetch_factor`: 2
327
+ - `load_best_model_at_end`: True
328
+ - `ddp_find_unused_parameters`: True
329
+ - `push_to_hub`: True
330
+ - `hub_model_id`: thivy/norbert4-base-splade-retrieval
331
+ - `hub_strategy`: checkpoint
332
+ - `hub_private_repo`: False
333
+ - `gradient_checkpointing`: True
334
+ - `gradient_checkpointing_kwargs`: {'use_reentrant': False}
335
+ - `multi_dataset_batch_sampler`: round_robin
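+ 
+ These map directly onto the trainer's arguments; a hedged reproduction sketch (assuming the `SparseEncoderTrainingArguments` class from sentence-transformers v5):
+ 
+ ```python
+ from sentence_transformers.sparse_encoder import SparseEncoderTrainingArguments
+ 
+ args = SparseEncoderTrainingArguments(
+     output_dir="norbert4-base-splade-retrieval",
+     eval_strategy="steps",
+     per_device_train_batch_size=16,
+     per_device_eval_batch_size=32,
+     learning_rate=2e-5,
+     weight_decay=0.01,
+     num_train_epochs=1,
+     warmup_ratio=0.1,
+     bf16=True,
+     load_best_model_at_end=True,
+ )
+ ```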
336
+
337
+ #### All Hyperparameters
338
+ <details><summary>Click to expand</summary>
339
+
340
+ - `overwrite_output_dir`: False
341
+ - `do_predict`: False
342
+ - `eval_strategy`: steps
343
+ - `prediction_loss_only`: True
344
+ - `per_device_train_batch_size`: 16
345
+ - `per_device_eval_batch_size`: 32
346
+ - `per_gpu_train_batch_size`: None
347
+ - `per_gpu_eval_batch_size`: None
348
+ - `gradient_accumulation_steps`: 1
349
+ - `eval_accumulation_steps`: None
350
+ - `torch_empty_cache_steps`: None
351
+ - `learning_rate`: 2e-05
352
+ - `weight_decay`: 0.01
353
+ - `adam_beta1`: 0.9
354
+ - `adam_beta2`: 0.999
355
+ - `adam_epsilon`: 1e-08
356
+ - `max_grad_norm`: 1.0
357
+ - `num_train_epochs`: 1
358
+ - `max_steps`: -1
359
+ - `lr_scheduler_type`: linear
360
+ - `lr_scheduler_kwargs`: {}
361
+ - `warmup_ratio`: 0.1
362
+ - `warmup_steps`: 0
363
+ - `log_level`: passive
364
+ - `log_level_replica`: warning
365
+ - `log_on_each_node`: True
366
+ - `logging_nan_inf_filter`: True
367
+ - `save_safetensors`: True
368
+ - `save_on_each_node`: False
369
+ - `save_only_model`: False
370
+ - `restore_callback_states_from_checkpoint`: False
371
+ - `no_cuda`: False
372
+ - `use_cpu`: False
373
+ - `use_mps_device`: False
374
+ - `seed`: 42
375
+ - `data_seed`: None
376
+ - `jit_mode_eval`: False
377
+ - `bf16`: True
378
+ - `fp16`: False
379
+ - `fp16_opt_level`: O1
380
+ - `half_precision_backend`: auto
381
+ - `bf16_full_eval`: False
382
+ - `fp16_full_eval`: False
383
+ - `tf32`: None
384
+ - `local_rank`: 0
385
+ - `ddp_backend`: None
386
+ - `tpu_num_cores`: None
387
+ - `tpu_metrics_debug`: False
388
+ - `debug`: []
389
+ - `dataloader_drop_last`: True
390
+ - `dataloader_num_workers`: 2
391
+ - `dataloader_prefetch_factor`: 2
392
+ - `past_index`: -1
393
+ - `disable_tqdm`: False
394
+ - `remove_unused_columns`: True
395
+ - `label_names`: None
396
+ - `load_best_model_at_end`: True
397
+ - `ignore_data_skip`: False
398
+ - `fsdp`: []
399
+ - `fsdp_min_num_params`: 0
400
+ - `fsdp_config`: {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}
401
+ - `fsdp_transformer_layer_cls_to_wrap`: None
402
+ - `accelerator_config`: {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}
403
+ - `parallelism_config`: None
404
+ - `deepspeed`: None
405
+ - `label_smoothing_factor`: 0.0
406
+ - `optim`: adamw_torch_fused
407
+ - `optim_args`: None
408
+ - `adafactor`: False
409
+ - `group_by_length`: False
410
+ - `length_column_name`: length
411
+ - `project`: huggingface
412
+ - `trackio_space_id`: trackio
413
+ - `ddp_find_unused_parameters`: True
414
+ - `ddp_bucket_cap_mb`: None
415
+ - `ddp_broadcast_buffers`: False
416
+ - `dataloader_pin_memory`: True
417
+ - `dataloader_persistent_workers`: False
418
+ - `skip_memory_metrics`: True
419
+ - `use_legacy_prediction_loop`: False
420
+ - `push_to_hub`: True
421
+ - `resume_from_checkpoint`: None
422
+ - `hub_model_id`: thivy/norbert4-base-splade-retrieval
423
+ - `hub_strategy`: checkpoint
424
+ - `hub_private_repo`: False
425
+ - `hub_always_push`: False
426
+ - `hub_revision`: None
427
+ - `gradient_checkpointing`: True
428
+ - `gradient_checkpointing_kwargs`: {'use_reentrant': False}
429
+ - `include_inputs_for_metrics`: False
430
+ - `include_for_metrics`: []
431
+ - `eval_do_concat_batches`: True
432
+ - `fp16_backend`: auto
433
+ - `push_to_hub_model_id`: None
434
+ - `push_to_hub_organization`: None
435
+ - `mp_parameters`:
436
+ - `auto_find_batch_size`: False
437
+ - `full_determinism`: False
438
+ - `torchdynamo`: None
439
+ - `ray_scope`: last
440
+ - `ddp_timeout`: 1800
441
+ - `torch_compile`: False
442
+ - `torch_compile_backend`: None
443
+ - `torch_compile_mode`: None
444
+ - `include_tokens_per_second`: False
445
+ - `include_num_input_tokens_seen`: no
446
+ - `neftune_noise_alpha`: None
447
+ - `optim_target_modules`: None
448
+ - `batch_eval_metrics`: False
449
+ - `eval_on_start`: False
450
+ - `use_liger_kernel`: False
451
+ - `liger_kernel_config`: None
452
+ - `eval_use_gather_object`: False
453
+ - `average_tokens_across_devices`: True
454
+ - `prompts`: None
455
+ - `batch_sampler`: batch_sampler
456
+ - `multi_dataset_batch_sampler`: round_robin
457
+ - `router_mapping`: {}
458
+ - `learning_rate_mapping`: {}
459
+
460
+ </details>
461
+
462
+ ### Training Logs
463
+ | Epoch | Step | Training Loss | Validation Loss | NanoNFCorpus_dot_ndcg@10 |
464
+ |:------:|:----:|:-------------:|:---------------:|:------------------------:|
465
+ | 0.0048 | 50 | 37895.69 | - | - |
466
+ | 0.0096 | 100 | 10002.0562 | - | - |
467
+ | 0.0144 | 150 | 3805.4731 | - | - |
468
+ | 0.0192 | 200 | 923.0944 | - | - |
469
+ | 0.0240 | 250 | 514.7795 | - | - |
470
+ | 0.0288 | 300 | 284.5449 | - | - |
471
+ | 0.0336 | 350 | 90.0678 | - | - |
472
+ | 0.0384 | 400 | 30.8482 | - | - |
473
+ | 0.0432 | 450 | 2.5071 | - | - |
474
+ | 0.0480 | 500 | 1.3525 | 2.2663 | 0.0272 |
475
+
476
+
477
+ ### Framework Versions
478
+ - Python: 3.12.12
479
+ - Sentence Transformers: 5.2.0
480
+ - Transformers: 4.57.3
481
+ - PyTorch: 2.9.1+cu128
482
+ - Accelerate: 1.12.0
483
+ - Datasets: 4.4.2
484
+ - Tokenizers: 0.22.2
485
+
486
+ ## Citation
487
+
488
+ ### BibTeX
489
+
490
+ #### Sentence Transformers
491
+ ```bibtex
492
+ @inproceedings{reimers-2019-sentence-bert,
493
+ title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
494
+ author = "Reimers, Nils and Gurevych, Iryna",
495
+ booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
496
+ month = "11",
497
+ year = "2019",
498
+ publisher = "Association for Computational Linguistics",
499
+ url = "https://arxiv.org/abs/1908.10084",
500
+ }
501
+ ```
502
+
503
+ #### SpladeLoss
504
+ ```bibtex
505
+ @misc{formal2022distillationhardnegativesampling,
506
+ title={From Distillation to Hard Negative Sampling: Making Sparse Neural IR Models More Effective},
507
+ author={Thibault Formal and Carlos Lassance and Benjamin Piwowarski and Stéphane Clinchant},
508
+ year={2022},
509
+ eprint={2205.04733},
510
+ archivePrefix={arXiv},
511
+ primaryClass={cs.IR},
512
+ url={https://arxiv.org/abs/2205.04733},
513
+ }
514
+ ```
515
+
516
+ #### SparseMultipleNegativesRankingLoss
517
+ ```bibtex
518
+ @misc{henderson2017efficient,
519
+ title={Efficient Natural Language Response Suggestion for Smart Reply},
520
+ author={Matthew Henderson and Rami Al-Rfou and Brian Strope and Yun-hsuan Sung and Laszlo Lukacs and Ruiqi Guo and Sanjiv Kumar and Balint Miklos and Ray Kurzweil},
521
+ year={2017},
522
+ eprint={1705.00652},
523
+ archivePrefix={arXiv},
524
+ primaryClass={cs.CL}
525
+ }
526
+ ```
527
+
528
+ #### FlopsLoss
529
+ ```bibtex
530
+ @article{paria2020minimizing,
531
+ title={Minimizing flops to learn efficient sparse representations},
532
+ author={Paria, Biswajit and Yeh, Chih-Kuan and Yen, Ian EH and Xu, Ning and Ravikumar, Pradeep and P{\'o}czos, Barnab{\'a}s},
533
+ journal={arXiv preprint arXiv:2004.05665},
534
+ year={2020}
535
+ }
536
+ ```
537
+
538
+ <!--
539
+ ## Glossary
540
+
541
+ *Clearly define terms in order to be accessible across audiences.*
542
+ -->
543
+
544
+ <!--
545
+ ## Model Card Authors
546
+
547
+ *Lists the people who create the model card, providing recognition and accountability for the detailed work that goes into its construction.*
548
+ -->
549
+
550
+ <!--
551
+ ## Model Card Contact
552
+
553
+ *Provides a way for people who have updates to the Model Card, suggestions, or questions, to contact the Model Card authors.*
554
+ -->
last-checkpoint/config.json ADDED
@@ -0,0 +1,43 @@
+ {
+   "architectures": [
+     "GptBertForMaskedLM"
+   ],
+   "attention_dropout": 0.0,
+   "attn_implementation": null,
+   "auto_map": {
+     "AutoConfig": "configuration_gptbert.GptBertConfig",
+     "AutoModel": "modeling_gptbert.GptBertModel",
+     "AutoModelForCausalLM": "modeling_gptbert.GptBertForCausalLM",
+     "AutoModelForMaskedLM": "modeling_gptbert.GptBertForMaskedLM",
+     "AutoModelForMultipleChoice": "modeling_gptbert.GptBertForMultipleChoice",
+     "AutoModelForQuestionAnswering": "modeling_gptbert.GptBertForQuestionAnswering",
+     "AutoModelForSequenceClassification": "modeling_gptbert.GptBertForSequenceClassification",
+     "AutoModelForTokenClassification": "modeling_gptbert.GptBertForTokenClassification"
+   },
+   "bos_token_id": 1,
+   "classifier_dropout": 0.2,
+   "deterministic_flash_attn": false,
+   "dtype": "float32",
+   "embedding_dropout": 0.1,
+   "eos_token_id": 2,
+   "global_window_length": 8192,
+   "hidden_dropout": 0.0,
+   "hidden_size": 640,
+   "intermediate_size": 1664,
+   "layer_norm_eps": 1e-07,
+   "local_global_ratio": 4,
+   "local_window_length": 256,
+   "mask_token_id": 4,
+   "max_sequence_length": 16384,
+   "model": "norbert4",
+   "num_attention_heads": 10,
+   "num_layers": 24,
+   "pad_token_id": 3,
+   "query_key_head_size": 64,
+   "rope_theta": 160000,
+   "transformers_version": "4.57.3",
+   "unk_token_id": 0,
+   "use_cache": false,
+   "value_head_size": 64,
+   "vocab_size": 51200
+ }
last-checkpoint/config_sentence_transformers.json ADDED
@@ -0,0 +1,14 @@
+ {
+   "model_type": "SparseEncoder",
+   "__version__": {
+     "sentence_transformers": "5.2.0",
+     "transformers": "4.57.3",
+     "pytorch": "2.9.1+cu128"
+   },
+   "prompts": {
+     "query": "",
+     "document": ""
+   },
+   "default_prompt_name": null,
+   "similarity_fn_name": "dot"
+ }
last-checkpoint/configuration_gptbert.py ADDED
@@ -0,0 +1,34 @@
+ from __future__ import annotations
+ 
+ import json
+ from pathlib import Path
+ import copy
+ from transformers.configuration_utils import PretrainedConfig
+ 
+ 
+ class GptBertConfig(PretrainedConfig):
+ 
+     def __init__(
+         self,
+         config_file: Path | str | None = None,
+         **kwargs
+     ):
+         super().__init__(**kwargs)
+         self.model = "norbert4"
+ 
+         if config_file is not None:
+             if type(config_file) is str:
+                 config_file = Path(config_file)
+             # After the str -> Path conversion above, config_file must be a Path
+             assert type(config_file) is Path, "config_file should be either a Path or a str"
+             with config_file.open("r") as file:
+                 config = json.load(file)
+ 
+             # Attributes from the config file, lower-casing string values
+             for attr, value in config.items():
+                 if isinstance(value, str):
+                     value = value.lower()
+                 setattr(self, attr, value)
+ 
+         # Keyword arguments override any file-provided values
+         for attr, value in kwargs.items():
+             if isinstance(value, str):
+                 value = value.lower()
+             setattr(self, attr, value)
last-checkpoint/model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ca6b59b6342fcd6a1910b237e5db7707f98673239940cfc25a5d1876082ebc33
+ size 728561776
last-checkpoint/modeling_gptbert.py ADDED
@@ -0,0 +1,1105 @@
1
+ from __future__ import annotations
2
+
3
+ import torch
4
+ import torch.nn as nn
5
+ from torch.nn import functional as F
6
+ from torch import _softmax_backward_data
7
+
8
+ from functools import partial, lru_cache
9
+
10
+ from .configuration_gptbert import GptBertConfig
11
+ from transformers.modeling_utils import PreTrainedModel
12
+ from transformers.activations import gelu_new
13
+ from transformers.utils import is_flash_attn_2_available, logging
14
+ from transformers.modeling_outputs import (
15
+ MaskedLMOutput,
16
+ MultipleChoiceModelOutput,
17
+ QuestionAnsweringModelOutput,
18
+ SequenceClassifierOutput,
19
+ TokenClassifierOutput,
20
+ BaseModelOutput,
21
+ CausalLMOutput
22
+ )
23
+ import math
24
+ from typing import TYPE_CHECKING, Optional, Union, Tuple, List
25
+
26
+ logger = logging.get_logger(__name__)
27
+
28
+
29
+ # Workaround for transformers < 4.36.0 check_imports issue
30
+ # See: https://github.com/huggingface/transformers/issues/28459
31
+ try:
32
+ if is_flash_attn_2_available():
33
+ from flash_attn.flash_attn_interface import flash_attn_varlen_qkvpacked_func
34
+ from flash_attn.layers.rotary import RotaryEmbedding
35
+ from flash_attn.ops.triton.rotary import apply_rotary
36
+ else:
37
+ flash_attn_varlen_qkvpacked_func, RotaryEmbedding, apply_rotary = None, object, None
38
+ logger.warning_once(
39
+ "NorBERT4 støtter FlashAttention, men det er ikke funnet i miljøet ditt. Du bør vurdere å oppdatere miljøet ditt for å få raskere og mindre minnekrevende behandling."
40
+ )
41
+ except ImportError:
42
+ flash_attn_varlen_qkvpacked_func, RotaryEmbedding, apply_rotary = None, object, None
43
+ logger.warning_once(
44
+ "NorBERT4 støtter FlashAttention, men det er ikke funnet i miljøet ditt. Du bør vurdere å oppdatere miljøet ditt for å få raskere og mindre minnekrevende behandling."
45
+ )
46
+
47
+
48
+ # from https://github.com/huggingface/transformers/blob/main/src/transformers/models/modernbert/modeling_modernbert.py
49
+ @torch.compiler.disable()
50
+ def _unpad_input(input_ids: torch.Tensor, attention_mask: torch.Tensor):
51
+ seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
52
+ indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
53
+ max_seqlen_in_batch = int(seqlens_in_batch.max().item())
54
+ cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
55
+
56
+ if input_ids.dim() == 2:
57
+ unpadded_inputs = input_ids.flatten()[indices]
58
+ else:
59
+ batch_size, sequence_length, *rest = input_ids.shape
60
+ shape = batch_size * sequence_length
61
+ unpadded_inputs = input_ids.view(shape, *rest)[indices]
62
+
63
+ return unpadded_inputs, indices, cu_seqlens, max_seqlen_in_batch
64
+
65
+
66
+ # from https://github.com/huggingface/transformers/blob/main/src/transformers/models/modernbert/modeling_modernbert.py
67
+ def _pad_output(input_ids: torch.Tensor, indices: torch.Tensor, batch_size: int, sequence_length: int) -> torch.Tensor:
68
+ if input_ids.dim() == 1:
69
+ output = torch.zeros(batch_size * sequence_length, dtype=input_ids.dtype, device=input_ids.device)
70
+ output[indices] = input_ids
71
+ padded_inputs = output.view(batch_size, sequence_length)
72
+ else:
73
+ _, *rest = input_ids.shape
74
+ output = torch.zeros(batch_size * sequence_length, *rest, dtype=input_ids.dtype, device=input_ids.device)
75
+ output[indices] = input_ids
76
+ padded_inputs = output.view(batch_size, sequence_length, *rest)
77
+
78
+ return padded_inputs
79
+
80
+
81
+ class CastedLinear(nn.Linear):
82
+ def __init__(self, in_features, out_features, bias):
83
+ super().__init__(in_features, out_features, bias=bias)
84
+
85
+ def forward(self, x):
86
+ return F.linear(x, self.weight.type_as(x), bias=self.bias.type_as(x) if self.bias is not None else None)
87
+
88
+
89
+ class CastedLinearIn(nn.Linear):
90
+ def __init__(self, in_features, out_features, bias):
91
+ super().__init__(in_features, out_features, bias=bias)
92
+ self.scale = nn.Parameter(torch.ones(in_features))
93
+
94
+ def forward(self, x):
95
+ return F.linear(x, (self.weight * (self.scale + 1.0).unsqueeze(0)).type_as(x), bias=self.bias.type_as(x) if self.bias is not None else None)
96
+
97
+
98
+ class MultiCastedLinearOrthoIn(nn.Module):
99
+ def __init__(self, in_features, out_features, bias):
100
+ super().__init__()
101
+
102
+ self.in_features = in_features
103
+ self.out_features = out_features
104
+
105
+ self.weights = nn.ParameterList()
106
+ for out_feature in out_features:
107
+ self.weights.append(nn.Parameter(torch.empty((out_feature, in_features))))
108
+
109
+ if bias:
110
+ self.bias = nn.Parameter(torch.zeros(sum(out_features)))
111
+ else:
112
+ self.bias = self.register_parameter("bias", None)
113
+
114
+ self.scale = nn.Parameter(torch.ones(in_features))
115
+
116
+ def forward(self, x):
117
+ return F.linear(x, (torch.cat([weight for weight in self.weights], dim=0) * (self.scale + 1.0).unsqueeze(0)).type_as(x), bias=self.bias.type_as(x) if self.bias is not None else None)
118
+
119
+
120
+ class GeGLU(nn.Module):
121
+ def forward(self, x):
122
+ x, gate = x.chunk(2, dim=-1)
123
+ return x * gelu_new(gate)
124
+
125
+
126
+ class Embedding(nn.Module):
127
+ def __init__(self, config: GptBertConfig):
128
+ super().__init__()
129
+
130
+ self.word_embedding = nn.Embedding(config.vocab_size, config.hidden_size)
131
+ self.word_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps, elementwise_affine=False, bias=False)
132
+ self.word_scale = nn.Parameter(torch.zeros(config.hidden_size))
133
+ self.dropout = nn.Dropout(config.embedding_dropout)
134
+
135
+ def forward(self, input_ids: torch.Tensor):
136
+ word_embedding = self.word_embedding(input_ids)
137
+ word_embedding = self.word_norm(word_embedding)
138
+ word_embedding = word_embedding * (self.word_scale + 1.0)
139
+
140
+ return self.dropout(word_embedding)
141
+
142
+
143
+ class LMClassifier(nn.Module):
144
+ def __init__(self, config: GptBertConfig, n_labels: int):
145
+ super().__init__()
146
+
147
+ self.pre_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps, elementwise_affine=False)
148
+ self.projection = CastedLinearIn(config.hidden_size, config.hidden_size, bias=False)
149
+ self.post_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps, elementwise_affine=False)
150
+ self.emb2vocab = CastedLinearIn(config.hidden_size, n_labels, bias=True)
151
+
152
+ def forward(self, x: torch.Tensor):
153
+ x = self.pre_norm(x.float()).type_as(x)
154
+ x = self.projection(x)
155
+ x = gelu_new(x)
156
+ x = self.post_norm(x.float()).type_as(x)
157
+ x = self.emb2vocab(x)
158
+ return x
159
+
160
+
161
+ class Classifier(nn.Module):
162
+ def __init__(self, config: GptBertConfig, n_labels: int):
163
+ super().__init__()
164
+
165
+ self.pre_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps, elementwise_affine=False)
166
+ self.projection = CastedLinearIn(config.hidden_size, config.hidden_size, bias=False)
167
+ self.post_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps, elementwise_affine=False)
168
+ self.dropout = nn.Dropout(config.classifier_dropout)
169
+ self.output_projection = CastedLinearIn(config.hidden_size, n_labels, bias=True)
170
+
171
+ def forward(self, x: torch.Tensor):
172
+ x = self.pre_norm(x.float()).type_as(x)
173
+ x = self.projection(x)
174
+ x = gelu_new(x)
175
+ x = self.post_norm(x.float()).type_as(x)
176
+ x = self.dropout(x)
177
+ x = self.output_projection(x)
178
+ return x
179
+
180
+
181
+ # from https://github.com/huggingface/transformers/blob/main/src/transformers/models/modernbert/modeling_modernbert.py
182
+ def flash_attention_forward(qkv: torch.Tensor, rotary_emb: UnpaddedRotaryEmbedding, cu_seqlens: torch.Tensor, max_seqlen: int, causal: bool, local_attention: Tuple[int, int], dropout_p: float, deterministic: bool, target_dtype: torch.dtype = torch.bfloat16, **_kwargs):
183
+ qkv = rotary_emb(qkv, cu_seqlens=cu_seqlens, max_seqlen=max_seqlen)
184
+
185
+ convert_dtype = qkv.dtype not in (torch.float16, torch.bfloat16)
186
+ if convert_dtype:
187
+ # FA2 implementation only supports fp16 and bf16. If FA2 is supported,
188
+ # bfloat16 must be supported as of FA2 2.5.7. (Turing GPUs not supported)
189
+ orig_dtype = qkv.dtype
190
+ qkv = qkv.to(target_dtype)
191
+
192
+ attn = flash_attn_varlen_qkvpacked_func(
193
+ qkv,
194
+ cu_seqlens=cu_seqlens,
195
+ max_seqlen=max_seqlen,
196
+ dropout_p=dropout_p,
197
+ deterministic=deterministic,
198
+ window_size=local_attention,
199
+ causal=causal
200
+ )
201
+ attn = attn.to(orig_dtype) # type: ignore
202
+ else:
203
+ attn = flash_attn_varlen_qkvpacked_func(
204
+ qkv,
205
+ cu_seqlens=cu_seqlens,
206
+ max_seqlen=max_seqlen,
207
+ dropout_p=dropout_p,
208
+ deterministic=deterministic,
209
+ window_size=local_attention,
210
+ causal=causal
211
+ )
212
+ return attn
213
+
214
+
215
+ # from https://github.com/huggingface/transformers/blob/main/src/transformers/models/modernbert/modeling_modernbert.py
216
+ class ApplyRotaryEmbUnpad(torch.autograd.Function):
217
+ @staticmethod
218
+ def forward(ctx, qkv, cos, sin, cu_seqlens: Optional[torch.Tensor] = None, max_seqlen: Optional[int] = None):
219
+ # (total_nnz, 3, nheads, headdim)
220
+ qkv = qkv.contiguous()
221
+ total_nnz, _three, _nheads, headdim = qkv.shape
222
+ # We need qkv to be contiguous so that when we reshape to combine (3, nheads) dimensions,
223
+ # we get the same tensor
224
+ # qk = rearrange(qkv[:, :2], "b_s t h d -> b_s (t h) d")
225
+ qk = qkv[:, :2].view(total_nnz, -1, headdim)
226
+ apply_rotary(qk, cos, sin, seqlen_offsets=0, cu_seqlens=cu_seqlens, max_seqlen=max_seqlen, interleaved=False, inplace=True)
227
+
228
+ ctx.save_for_backward(cos, sin, cu_seqlens)
229
+ ctx.max_seqlen = max_seqlen
230
+ return qkv
231
+
232
+ @staticmethod
233
+ def backward(ctx, do):
234
+ cos, sin, cu_seqlens = ctx.saved_tensors
235
+ do = do.contiguous()
236
+ total_nnz, _three, _nheads, headdim = do.shape
237
+ # We need dqkv to be contiguous so that when we reshape to combine (3, nheads) dimensions,
238
+ # we get the same tensor
239
+ dqk = do[:, :2].view(total_nnz, -1, headdim)
240
+ apply_rotary(
241
+ dqk,
242
+ cos,
243
+ sin,
244
+ seqlen_offsets=0,
245
+ cu_seqlens=cu_seqlens,
246
+ max_seqlen=ctx.max_seqlen,
247
+ interleaved=False,
248
+ inplace=True,
249
+ conjugate=True,
250
+ )
251
+
252
+ return do, None, None, None, None, None, None
253
+
254
+
255
+ # from https://github.com/huggingface/transformers/blob/main/src/transformers/models/modernbert/modeling_modernbert.py
256
+ def apply_rotary_unpadded(qkv, cos, sin, cu_seqlens: Optional[torch.Tensor] = None, max_seqlen: Optional[int] = None):
257
+ return ApplyRotaryEmbUnpad.apply(qkv, cos, sin, cu_seqlens, max_seqlen)
258
+
259
+
260
+ # from https://github.com/huggingface/transformers/blob/main/src/transformers/models/modernbert/modeling_modernbert.py
261
+ class UnpaddedRotaryEmbedding(RotaryEmbedding):
262
+ def __init__(self, dim: int, base: float = 10000.0, max_seqlen: Optional[int] = None):
263
+ super().__init__(dim=dim, base=base, device=None, interleaved=False)
264
+ self.max_seqlen = max_seqlen
265
+
266
+ def forward(self, qkv: torch.Tensor, cu_seqlens: torch.Tensor, max_seqlen: Optional[int] = None) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
267
+ if max_seqlen is not None:
268
+ self._update_cos_sin_cache(max_seqlen, device=qkv.device, dtype=qkv.dtype)
269
+
270
+ qkv = apply_rotary_unpadded(
271
+ qkv,
272
+ self._cos_cached,
273
+ self._sin_cached,
274
+ cu_seqlens=cu_seqlens,
275
+ max_seqlen=max_seqlen,
276
+ )
277
+
278
+ return qkv
279
+
280
+
281
+ class RotaryPositionalEmbeddings(nn.Module):
282
+ def __init__(self, config, theta: int):
283
+ super().__init__()
284
+
285
+ head_size = config.query_key_head_size
286
+ assert head_size % 2 == 0
287
+ max_seq_len = config.max_sequence_length
288
+
289
+ inv_freq = 1.0 / (theta ** (torch.arange(0, head_size, 2, dtype=torch.float32) / head_size))
290
+ pos = torch.arange(max_seq_len, dtype=torch.float32)
291
+ embedding = torch.einsum('n, d -> nd', pos, inv_freq)
292
+ embedding = torch.cat([embedding, embedding], dim=-1).unsqueeze(0)
293
+ self.register_buffer("cos_matrix", embedding.cos(), persistent=False)
294
+ self.register_buffer("sin_matrix", embedding.sin(), persistent=False)
295
+
296
+ def forward(self, x: torch.Tensor):
297
+ hidden_layer = x.float()
298
+
299
+ seq_len = x.shape[2]
300
+
301
+ cos_matrix = self.cos_matrix[:, None, :seq_len, :]
302
+ sin_matrix = self.sin_matrix[:, None, :seq_len, :]
303
+
304
+ x_rotate_half = torch.cat(
305
+ [
306
+ -hidden_layer[:, :, :, x.size(-1) // 2:],
307
+ hidden_layer[:, :, :, :x.size(-1) // 2]
308
+ ],
309
+ dim=-1
310
+ )
311
+
312
+ out = hidden_layer * cos_matrix + x_rotate_half * sin_matrix
313
+ return out.type_as(x)
314
+
315
+
316
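+ # Fused masked softmax: the mask is applied in place and only the softmax output is
+ # saved for backward (via torch's internal _softmax_backward_data), avoiding a
+ # separate masked copy of the attention scores.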
+ class MaskedSoftmax(torch.autograd.Function):
317
+ @staticmethod
318
+ def forward(ctx, x: torch.Tensor, mask: torch.BoolTensor, dim: int) -> torch.Tensor:
319
+ ctx.dim = dim
320
+ x.masked_fill_(mask, float('-inf'))
321
+ x = torch.softmax(x, ctx.dim)
322
+ x.masked_fill_(mask, 0.0)
323
+ ctx.save_for_backward(x)
324
+ return x
325
+
326
+ @staticmethod
327
+ def backward(ctx, grad_output: torch.Tensor) -> tuple[torch.Tensor, None, None]:
328
+ output: torch.Tensor
329
+
330
+ output, = ctx.saved_tensors
331
+ inputGrad: torch.Tensor = _softmax_backward_data(grad_output, output, ctx.dim, output.dtype)
332
+ return inputGrad, None, None
333
+
334
+
335
+ class SelfAttention(nn.Module):
336
+ def __init__(self, config: GptBertConfig, layer_idx: int):
337
+ super().__init__()
338
+
339
+ self.config = config
340
+ self.layer_idx = layer_idx
341
+
342
+ self.d_qk = config.query_key_head_size
343
+ self.d_v = config.value_head_size
344
+ self.num_attention_heads = config.num_attention_heads
345
+ self.num_kv_heads = config.num_attention_heads
346
+ self.hidden_size = config.hidden_size
347
+
348
+ self.q_out_dim = self.d_qk * self.num_attention_heads
349
+ self.k_out_dim = self.d_qk * self.num_kv_heads
350
+ self.v_out_dim = self.d_v * self.num_kv_heads
351
+
352
+ self.qk_proj = MultiCastedLinearOrthoIn(self.hidden_size, [self.q_out_dim, self.k_out_dim], bias=False)
353
+ self.v_proj = CastedLinearIn(self.hidden_size, self.v_out_dim, bias=False)
354
+ self.out_proj = CastedLinearIn(self.d_v*self.num_attention_heads, self.hidden_size, bias=False)
355
+
356
+ self.pre_v_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps, elementwise_affine=False)
357
+ self.pre_qk_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps, elementwise_affine=False)
358
+ self.inter_norm = nn.LayerNorm(self.d_v * self.num_attention_heads, eps=config.layer_norm_eps, elementwise_affine=False)
359
+ self.q_norm = nn.LayerNorm(self.d_qk, eps=config.layer_norm_eps, elementwise_affine=False, bias=False)
360
+ self.k_norm = nn.LayerNorm(self.d_qk, eps=config.layer_norm_eps, elementwise_affine=False, bias=False)
361
+ self.k_scale = nn.Parameter(torch.ones(self.num_kv_heads, self.d_qk))
362
+ self.q_scale = nn.Parameter(torch.ones(self.num_attention_heads, self.d_qk))
363
+
364
+ self.attention_dropout = nn.Dropout(config.attention_dropout)
365
+ self.dropout = nn.Dropout(config.hidden_dropout)
366
+
367
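+ # Every `local_global_ratio`-th layer is a global-attention layer and uses the larger
+ # RoPE base (matching `rope_theta` in the config); the remaining local layers keep
+ # the standard base of 10_000.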
+ theta = 160_000 if (layer_idx + 1) % config.local_global_ratio == 0 else 10_000
368
+
369
+ # Initialize rotary embeddings based on whether FlashAttention is available
370
+ if flash_attn_varlen_qkvpacked_func is not None:
371
+ self.rope_embedding = UnpaddedRotaryEmbedding(dim=self.d_qk, base=theta, max_seqlen=config.max_sequence_length)
372
+ else:
373
+ self.rope_embedding = RotaryPositionalEmbeddings(config, theta)
374
+
375
+ self.scale = 1.0 / math.sqrt(self.d_qk)
376
+ self.lambdas = nn.Parameter(torch.tensor([0.5]))
377
+
378
+ self.sequence_length = config.max_sequence_length
379
+ self.is_causal = config.is_decoder
380
+ self.window_length = None
381
+
382
+ def set_window_length(self, window_length: int):
383
+ self.window_length = window_length
384
+
385
+ def _get_window_mask(self, query_length: int, key_length: int, device: torch.device):
386
+ """Create and cache window attention mask."""
387
+ if self.is_causal:
388
+ mask = torch.ones(query_length, key_length, dtype=torch.bool, device=device)
389
+ mask = mask.tril().triu(diagonal=-self.window_length)
390
+ else:
391
+ mask = torch.ones(query_length, key_length, dtype=torch.bool, device=device)
392
+ mask = mask.tril(diagonal=self.window_length).triu(diagonal=-self.window_length)
393
+ return mask.view(1, 1, query_length, key_length)
394
+
395
+ def attention_operation(self, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, padding_mask: Optional[torch.Tensor]) -> Tuple[torch.Tensor, torch.Tensor]:
396
+ """Standard attention computation with masking."""
397
+ batch_size, _, query_length, _ = query.size()
398
+ _, _, key_length, _ = key.size()
399
+
400
+ # Use cached window mask
401
+ with torch.no_grad():
402
+ window_mask = self._get_window_mask(query_length, key_length, query.device)
403
+ if padding_mask is not None:
404
+ attention_mask = padding_mask & window_mask
405
+ else:
406
+ attention_mask = window_mask
407
+
408
+ attention_scores = torch.bmm(query.flatten(0, 1), key.transpose(-1, -2).flatten(0, 1)) * self.scale # shape: [B*H, Q_T, K_T]
409
+ attention_scores = attention_scores.view(batch_size, self.num_attention_heads, query_length, key_length)
410
+
411
+ attention_probabilities = MaskedSoftmax.apply(attention_scores, ~attention_mask, -1)
412
+ attention_probabilities = self.attention_dropout(attention_probabilities)
413
+
414
+ output = torch.bmm(attention_probabilities.flatten(0, 1), value.flatten(0, 1))
415
+ output = output.view(batch_size, self.num_attention_heads, query_length, self.d_v)
416
+
417
+ return output
418
+
419
+ def forward(self, hidden_layer: torch.Tensor, qk_layer: torch.Tensor, v1: torch.Tensor | None, padding_info):
420
+ # Get original shape info
421
+ if flash_attn_varlen_qkvpacked_func is not None:
422
+ # Unpadded case
423
+ indices, cu_seqlens, max_seqlen = padding_info
424
+ total_seqlen = hidden_layer.size(0)
425
+ batch_size = cu_seqlens.size(0) - 1
426
+ else:
427
+ # Padded case
428
+ batch_size, seq_length = hidden_layer.size(0), hidden_layer.size(1)
429
+
430
+ hidden_layer = self.pre_v_norm(hidden_layer.float()).type_as(hidden_layer)
431
+ qk_layer = self.pre_qk_norm(qk_layer.float()).type_as(qk_layer)
432
+
433
+ query, key = self.qk_proj(qk_layer).tensor_split([self.q_out_dim], dim=-1)
434
+ value = self.v_proj(hidden_layer)
435
+
436
+ if flash_attn_varlen_qkvpacked_func is not None:
437
+ # Reshape for FlashAttention: (total_seqlen, num_heads, head_dim)
438
+ query = query.view(total_seqlen, self.num_attention_heads, self.d_qk)
439
+ key = key.view(total_seqlen, self.num_kv_heads, self.d_qk)
440
+ value = value.view(total_seqlen, self.num_kv_heads, self.d_v)
441
+
442
+ # Apply layer norm and scaling
443
+ query = ((self.q_scale + 1.0).unsqueeze(0) * self.q_norm(query.float())).type_as(query)
444
+ key = ((self.k_scale + 1.0).unsqueeze(0) * self.k_norm(key.float())).type_as(key)
445
+
446
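+ # Value-residual mixing: v1 caches the first layer's value projections, and each
+ # later layer blends its own values with them via the learned lambda weight.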
+ if v1 is None:
447
+ v1 = value
448
+ value = (1 - self.lambdas[0]) * value + self.lambdas[0] * v1
449
+
450
+ # Prepare qkv for FlashAttention
451
+ qkv = torch.stack([query, key, value], dim=1) # (total_seqlen, 3, num_heads, head_dim)
452
+
453
+ # Determine window size for local attention
454
+ if self.window_length is not None and self.window_length > 0:
455
+ if self.is_causal:
456
+ local_attention = (self.window_length - 1, 0)
457
+ else:
458
+ local_attention = (self.window_length - 1, self.window_length - 1)
459
+ else:
460
+ local_attention = (-1, -1)
461
+
462
+ # Apply FlashAttention
463
+ output = flash_attention_forward(
464
+ qkv,
465
+ self.rope_embedding,
466
+ cu_seqlens,
467
+ max_seqlen,
468
+ self.is_causal,
469
+ local_attention,
470
+ self.config.attention_dropout if self.training else 0.0,
471
+ self.config.deterministic_flash_attn
472
+ )
473
+
474
+ # Reshape output back
475
+ output = output.view(total_seqlen, self.d_v * self.num_attention_heads)
476
+
477
+ else:
478
+ # Standard attention path
479
+ query_length = query.size(1)
480
+ key_length = key.size(1)
481
+
482
+ query = query.reshape(batch_size, query_length, self.num_attention_heads, self.d_qk).transpose(1, 2)
483
+ key = key.reshape(batch_size, key_length, self.num_kv_heads, self.d_qk).transpose(1, 2)
484
+ value = value.reshape(batch_size, key_length, self.num_kv_heads, self.d_v).transpose(1, 2)
485
+
486
+ query = ((self.q_scale + 1.0).unsqueeze(1).unsqueeze(0) * self.q_norm(query.float())).type_as(query)
487
+ key = ((self.k_scale + 1.0).unsqueeze(1).unsqueeze(0) * self.k_norm(key.float())).type_as(key)
488
+
489
+ if v1 is None:
490
+ v1 = value
491
+ else:
492
+ value = (1 - self.lambdas[0]) * value + self.lambdas[0] * v1
493
+
494
+ # Apply rotary embeddings
495
+ query = self.rope_embedding(query)
496
+ key = self.rope_embedding(key)
497
+
498
+ output = self.attention_operation(query, key, value, padding_info)
499
+ output = output.transpose(1, 2).flatten(2, 3) # shape: [B, T, H*D]
500
+
501
+ output = self.inter_norm(output.float()).type_as(output)
502
+ output = self.out_proj(output)
503
+ output = self.dropout(output)
504
+
505
+ return output, v1
506
+
507
+
508
+ class FeedForward(nn.Module):
509
+ def __init__(self, config: GptBertConfig):
510
+ super().__init__()
511
+ self.pre_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps, elementwise_affine=False)
512
+ self.up_proj = MultiCastedLinearOrthoIn(config.hidden_size, [config.intermediate_size, config.intermediate_size], bias=False)
513
+ self.activation = GeGLU()
514
+ self.inter_norm = nn.LayerNorm(config.intermediate_size, eps=config.layer_norm_eps, elementwise_affine=False)
515
+ self.down_proj = CastedLinearIn(config.intermediate_size, config.hidden_size, bias=False)
516
+ self.dropout = nn.Dropout(config.hidden_dropout)
517
+
518
+ def forward(self, x: torch.Tensor):
519
+ x = self.pre_norm(x.float()).type_as(x)
520
+ x = self.up_proj(x)
521
+ x = self.activation(x)
522
+ x = self.inter_norm(x.float()).type_as(x)
523
+ x = self.down_proj(x)
524
+ x = self.dropout(x)
525
+ return x
526
+
527
+
528
+ class Layer(nn.Module):
529
+ def __init__(self, config: GptBertConfig, layer_idx: int):
530
+ super().__init__()
531
+
532
+ self.attention = SelfAttention(config, layer_idx)
533
+ self.mlp = FeedForward(config)
534
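+ # Learned mixing coefficients: lambdas[0], [1], [3] and [5] interpolate between the
+ # residual stream and the token embeddings for the attention, QK, MLP and skip
+ # branches, while softplus(lambdas[2]) and softplus(lambdas[4]) scale the MLP and
+ # skip branches (see forward below).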
+ self.lambdas = nn.Parameter(torch.tensor([0., 0., 1., 0., 1., 0.]))
535
+
536
+ def set_window_length(self, window_length: int):
537
+ self.attention.set_window_length(window_length)
538
+
539
+ def forward(self, hidden_layer: torch.Tensor, embeddings: torch.Tensor, v1: torch.Tensor | None, padding_info):
540
+ attention_output = (1 - self.lambdas[0]) * hidden_layer + self.lambdas[0] * embeddings
541
+ qk_layer = (1 - self.lambdas[1]) * hidden_layer + self.lambdas[1] * embeddings
542
+ mlp_layer = F.softplus(self.lambdas[2]) * ((1 - self.lambdas[3]) * hidden_layer + self.lambdas[3] * embeddings)
543
+
544
+ attention_output, v1 = self.attention(attention_output, qk_layer, v1, padding_info)
545
+ mlp_layer = mlp_layer + attention_output
546
+ hidden_layer = F.softplus(self.lambdas[4]) * ((1 - self.lambdas[5]) * hidden_layer + self.lambdas[5] * embeddings)
547
+ output = hidden_layer + attention_output + self.mlp(mlp_layer)
548
+
549
+ return output, v1
550
+
551
+
552
+ class Encoder(nn.Module):
553
+ def __init__(self, config: GptBertConfig):
554
+ super().__init__()
555
+ self.layers = nn.ModuleList([Layer(config, i) for i in range(config.num_layers)])
556
+ self.local_global_ratio = config.local_global_ratio
557
+
558
+ def set_window_length(self, config: GptBertConfig):
559
+ for i, layer in enumerate(self.layers):
560
+ if (i + 1) % self.local_global_ratio == 0:
561
+ layer.set_window_length(config.global_window_length)
562
+ else:
563
+ layer.set_window_length(config.local_window_length)
564
+
565
+ def forward(self, hidden_layer: torch.Tensor, padding_info, output_hidden_states=False, checkpoint_activations=False):
566
+ hidden_layers = [hidden_layer] if output_hidden_states else None
567
+ v1 = None
568
+ embeddings = hidden_layer
569
+
570
+ for layer in self.layers:
571
+ if checkpoint_activations:
572
+ hidden_layer, v1 = torch.utils.checkpoint.checkpoint(layer, hidden_layer, embeddings, v1, padding_info, use_reentrant=True)
573
+ else:
574
+ hidden_layer, v1 = layer(hidden_layer, embeddings, v1, padding_info)
575
+
576
+ if output_hidden_states:
577
+ hidden_layers.append(hidden_layer)
578
+
579
+ return hidden_layer, hidden_layers
580
+
581
+
582
+ #
583
+ # HuggingFace wrappers
584
+ #
585
+
586
+ class GptBertPreTrainedModel(PreTrainedModel):
587
+ config_class = GptBertConfig
588
+ supports_gradient_checkpointing = True
589
+ _supports_flash_attn_2 = True
590
+ _supports_sdpa = True
591
+ _supports_flex_attn = False
592
+
593
+ def _init_weights(self, module):
594
+ std = math.sqrt(2.0 / (5.0 * self.hidden_size))
595
+
596
+ if isinstance(module, nn.Linear) or isinstance(module, CastedLinearIn):
597
+ nn.init.trunc_normal_(module.weight.data, mean=0.0, std=std, a=-2*std, b=2*std)
598
+ if module.bias is not None:
599
+ module.bias.data.zero_()
600
+ elif isinstance(module, nn.Embedding):
601
+ nn.init.trunc_normal_(module.weight.data, mean=0.0, std=std, a=-2*std, b=2*std)
602
+ elif isinstance(module, nn.LayerNorm):
603
+ module.bias.data.zero_()
604
+ module.weight.data.fill_(1.0)
605
+
606
+
607
+ class GptBertModel(GptBertPreTrainedModel):
608
+ def __init__(self, config: GptBertConfig, add_mlm_layer=False, **kwargs):
609
+ super().__init__(config, **kwargs)
610
+ self.config = config
611
+ self.hidden_size = config.hidden_size
612
+
613
+ self.embedding = Embedding(config)
614
+ self.encoder = Encoder(config)
615
+ self.classifier = LMClassifier(config, config.vocab_size) if add_mlm_layer else None
616
+ self.set_window_length(config)
617
+ self.gradient_checkpointing = False
618
+ self.post_init()
619
+
620
+ def set_window_length(self, config) -> None:
621
+ self.encoder.set_window_length(config)
622
+
623
+ def get_input_embeddings(self):
624
+ return self.embedding.word_embedding
625
+
626
+ def set_input_embeddings(self, value):
627
+ self.embedding.word_embedding = value
628
+
629
+ def get_contextualized_embeddings(
630
+ self,
631
+ input_ids: Optional[torch.Tensor] = None,
632
+ attention_mask: Optional[torch.Tensor] = None,
633
+ output_hidden_states: Optional[bool] = None
634
+ ):
635
+ if input_ids is not None:
636
+ input_shape = input_ids.size()
637
+ else:
638
+ raise ValueError("You have to specify input_ids")
639
+
640
+ batch_size, seq_length = input_shape
641
+ device = input_ids.device
642
+
643
+ if attention_mask is None:
644
+ attention_mask = torch.ones(batch_size, seq_length, dtype=torch.bool, device=device)
645
+ else:
646
+ attention_mask = attention_mask.bool()
647
+
648
+ if flash_attn_varlen_qkvpacked_func is not None:
649
+ if len(attention_mask.size()) != 2:
650
+ raise ValueError("Bare `attention_mask` med to dimensjoner støttes nå for FlashAttention.")
651
+ with torch.no_grad():
652
+ input_ids, indices, cu_seqlens, max_seqlen_in_batch = _unpad_input(input_ids, attention_mask)
653
+ padding_info = (indices, cu_seqlens, max_seqlen_in_batch)
654
+ else:
655
+ if len(attention_mask.size()) == 2:
656
+ attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
657
+ elif len(attention_mask.size()) == 3:
658
+ attention_mask = attention_mask.unsqueeze(1)
659
+ padding_info = attention_mask
660
+
661
+ static_embeddings = self.embedding(input_ids)
662
+
663
+ original_dtype = static_embeddings.dtype
664
+ if torch.cuda.is_available() and torch.cuda.is_bf16_supported() and static_embeddings.dtype == torch.float32:
665
+ static_embeddings = static_embeddings.bfloat16()
666
+
667
+ last_layer, contextualized_embeddings = self.encoder(
668
+ static_embeddings,
669
+ padding_info,
670
+ output_hidden_states=output_hidden_states,
671
+ checkpoint_activations=self.gradient_checkpointing and self.training
672
+ )
673
+
674
+ last_layer = last_layer.to(original_dtype)
675
+ if output_hidden_states:
676
+ contextualized_embeddings = [layer.to(original_dtype) for layer in contextualized_embeddings]
677
+
678
+ # Pad output if using FlashAttention
679
+ if flash_attn_varlen_qkvpacked_func is not None:
680
+ last_layer = _pad_output(last_layer, indices, batch_size, seq_length)
681
+ if output_hidden_states:
682
+ contextualized_embeddings = [_pad_output(layer, indices, batch_size, seq_length) for layer in contextualized_embeddings]
683
+ else:
684
+ contextualized_embeddings = None
685
+
686
+ return last_layer, contextualized_embeddings
687
+
688
+ def forward(
689
+ self,
690
+ input_ids: Optional[torch.Tensor] = None,
691
+ attention_mask: Optional[torch.Tensor] = None,
692
+ output_hidden_states: Optional[bool] = None,
693
+ output_attentions: Optional[bool] = None,
694
+ return_dict: Optional[bool] = None,
695
+ **kwargs
696
+ ) -> Union[Tuple[torch.Tensor], BaseModelOutput]:
697
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
698
+
699
+ sequence_output, contextualized_embeddings = self.get_contextualized_embeddings(input_ids, attention_mask, output_hidden_states)
700
+
701
+ if not return_dict:
702
+ return (
703
+ sequence_output,
704
+ *([contextualized_embeddings] if output_hidden_states else [])
705
+ )
706
+
707
+ return BaseModelOutput(
708
+ last_hidden_state=sequence_output,
709
+ hidden_states=contextualized_embeddings if output_hidden_states else None
710
+ )
711
+
712
+
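A hedged usage sketch for the base model: since `GptBertModel` is a custom architecture, loading it through `transformers` requires `trust_remote_code=True`. The checkpoint path is a placeholder, not an artifact of this commit:

```python
import torch
from transformers import AutoTokenizer, AutoModel

# Placeholder path; any directory containing this modeling code and its config works.
ckpt = "path/to/gptbert-checkpoint"
tokenizer = AutoTokenizer.from_pretrained(ckpt)
model = AutoModel.from_pretrained(ckpt, trust_remote_code=True)

inputs = tokenizer("Et lite eksempel.", return_tensors="pt")
with torch.no_grad():
    out = model(input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"])
print(out.last_hidden_state.shape)  # (1, seq_len, hidden_size)
```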
713
+ class GptBertForMaskedLM(GptBertModel):
714
+ _tied_weights_keys = ["classifier.emb2vocab.weight"]
715
+
716
+ def __init__(self, config: GptBertConfig, **kwargs):
717
+ super().__init__(config, add_mlm_layer=True, **kwargs)
718
+
719
+ def get_output_embeddings(self):
720
+ return self.classifier.emb2vocab.weight
721
+
722
+ def set_output_embeddings(self, new_embeddings):
723
+ self.classifier.emb2vocab.weight = new_embeddings
724
+
725
+ def forward(
726
+ self,
727
+ input_ids: Optional[torch.Tensor] = None,
728
+ attention_mask: Optional[torch.Tensor] = None,
729
+ output_hidden_states: Optional[bool] = None,
730
+ return_dict: Optional[bool] = None,
731
+ labels: Optional[torch.LongTensor] = None,
732
+ **kwargs
733
+ ) -> Union[Tuple[torch.Tensor], MaskedLMOutput]:
734
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
735
+
736
+ sequence_output, contextualized_embeddings = self.get_contextualized_embeddings(input_ids, attention_mask, output_hidden_states)
737
+ subword_prediction = self.classifier(sequence_output)
738
+ subword_prediction = 30 * torch.sigmoid(subword_prediction / 7.5)
739
+
740
+ masked_lm_loss = None
741
+ if labels is not None:
742
+ labels_flatten = labels[:, 1:].flatten()
743
+ subword_prediction_flatten = subword_prediction[:, :-1].flatten(0, 1)
744
+ masked_lm_loss = F.cross_entropy(subword_prediction_flatten, labels_flatten)
745
+
746
+ bos_logits = torch.zeros(subword_prediction.size(0), 1, self.config.vocab_size, dtype=subword_prediction.dtype, device=subword_prediction.device)
747
+ bos_logits[:, :, self.config.bos_token_id] = 1.0
748
+ subword_prediction = torch.cat([bos_logits, subword_prediction[:, :-1]], dim=1)
749
+
750
+ if not return_dict:
751
+ output = (
752
+ subword_prediction,
753
+ *([contextualized_embeddings] if output_hidden_states else [])
754
+ )
755
+ return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
756
+
757
+ return MaskedLMOutput(
758
+ loss=masked_lm_loss,
759
+ logits=subword_prediction,
760
+ hidden_states=contextualized_embeddings if output_hidden_states else None
761
+ )
762
+
763
+
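Note the unusual logit transform in the head above: `30 * torch.sigmoid(x / 7.5)` bounds every vocabulary score to the open interval (0, 30) before cross-entropy, a soft-capping choice that limits the influence of extreme scores. A quick numerical illustration:

```python
import torch

# The MLM/CLM heads squash raw scores into (0, 30) before the loss.
x = torch.tensor([-100.0, -7.5, 0.0, 7.5, 100.0])
print(30 * torch.sigmoid(x / 7.5))  # approx. [0.00, 8.07, 15.00, 21.93, 30.00]
```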
764
+ class GptBertForCausalLM(GptBertModel):
765
+ _tied_weights_keys = ["classifier.emb2vocab.weight"]
766
+
767
+ def __init__(self, config: GptBertConfig, **kwargs):
768
+ config.is_decoder = True
769
+ super().__init__(config, add_mlm_layer=True, **kwargs)
770
+
771
+ def get_output_embeddings(self):
772
+ return self.classifier.emb2vocab.weight
773
+
774
+ def set_output_embeddings(self, new_embeddings):
775
+ self.classifier.emb2vocab.weight = new_embeddings
776
+
777
+ def get_input_embeddings(self):
778
+ return self.embedding.word_embedding
779
+
780
+ def set_input_embeddings(self, value):
781
+ self.embedding.word_embedding = value
782
+
783
+ def set_decoder(self, decoder):
784
+ self.encoder = decoder
785
+
786
+ def get_decoder(self):
787
+ return self.encoder
788
+
789
+ def can_generate(self):
790
+ return True
791
+
792
+ def forward(
793
+ self,
794
+ input_ids: torch.LongTensor = None,
795
+ attention_mask: Optional[torch.Tensor] = None,
796
+ position_ids: Optional[torch.LongTensor] = None,
797
+ token_type_ids: Optional[torch.Tensor] = None,
798
+ past_key_values: Optional[torch.Tensor] = None,
799
+ inputs_embeds: Optional[torch.FloatTensor] = None,
800
+ labels: Optional[torch.LongTensor] = None,
801
+ use_cache: Optional[bool] = None,
802
+ cache_position: Optional[torch.LongTensor] = None,
803
+ output_attentions: Optional[bool] = None,
804
+ output_hidden_states: Optional[bool] = None,
805
+ return_dict: Optional[bool] = None
806
+ ) -> Union[Tuple, CausalLMOutput]:
807
+
808
+ assert inputs_embeds is None, "inputs_embeds is not supported for now"
809
+ assert past_key_values is None, "past_key_values is not supported for now"
810
+ assert not use_cache, "use_cache is not supported for now"
811
+
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
812
+ sequence_output, contextualized_embeddings = self.get_contextualized_embeddings(input_ids, attention_mask, output_hidden_states)
813
+ subword_prediction = self.classifier(sequence_output)
814
+ subword_prediction = 30 * torch.sigmoid(subword_prediction / 7.5)
815
+
816
+ causal_lm_loss = None
817
+ if labels is not None:
818
+ labels_flatten = labels[:, 1:].flatten()
819
+ subword_prediction_flatten = subword_prediction[:, :-1].flatten(0, 1)
820
+ causal_lm_loss = F.cross_entropy(subword_prediction_flatten, labels_flatten)
821
+
822
+ if not return_dict:
823
+ output = (
824
+ subword_prediction,
825
+ *([contextualized_embeddings] if output_hidden_states else [])
826
+ )
827
+ return ((causal_lm_loss,) + output) if causal_lm_loss is not None else output
828
+
829
+ return CausalLMOutput(
830
+ loss=causal_lm_loss,
831
+ logits=subword_prediction,
832
+ hidden_states=contextualized_embeddings if output_hidden_states else None
833
+ )
834
+
835
+ def prepare_inputs_for_generation(
836
+ self,
837
+ input_ids: torch.Tensor,
838
+ past_key_values: Optional[torch.Tensor] = None,
839
+ attention_mask: Optional[torch.Tensor] = None,
840
+ inputs_embeds: Optional[torch.Tensor] = None,
841
+ cache_position: Optional[torch.LongTensor] = None,
842
+ position_ids: Optional[torch.LongTensor] = None,
843
+ use_cache: bool = True,
844
+ num_logits_to_keep: Optional[int] = None,
845
+ **kwargs,
846
+ ):
847
+ # If we have cache: let's slice `input_ids` through `cache_position`, to keep only the unprocessed tokens
848
+ # Exception 1: when passing input_embeds, input_ids may be missing entries
849
+ # Exception 2: some generation methods do special slicing of input_ids, so we don't need to do it here
850
+ if past_key_values is not None:
851
+ if inputs_embeds is not None: # Exception 1
852
+ input_ids = input_ids[:, -cache_position.shape[0] :]
853
+ elif input_ids.shape[1] != cache_position.shape[0]: # Default case (the "else", a no op, is Exception 2)
854
+ input_ids = input_ids[:, cache_position]
855
+
856
+ if attention_mask is not None and position_ids is None:
857
+ # create position_ids on the fly for batch generation
858
+ position_ids = attention_mask.long().cumsum(-1) - 1
859
+ position_ids.masked_fill_(attention_mask == 0, 1)
860
+ if past_key_values:
861
+ position_ids = position_ids[:, -input_ids.shape[1] :]
862
+
863
+ # This `clone` call is needed to avoid recapturing cuda graphs with `torch.compile`'s
+ # `mode="reduce-overhead"`, as otherwise the input `position_ids` would have varying
+ # strides during decoding. Here, simply using `.contiguous()` is not sufficient: in the
+ # batch size = 1 case, `position_ids` is already contiguous but with varying stride,
+ # which retriggers a capture.
864
+ position_ids = position_ids.clone(memory_format=torch.contiguous_format)
865
+
866
+ # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
867
+ if inputs_embeds is not None and cache_position[0] == 0:
868
+ model_inputs = {"inputs_embeds": inputs_embeds}
869
+ else:
870
+ model_inputs = {"input_ids": input_ids.contiguous()} # `contiguous()` needed for compilation use cases
871
+
872
+ if num_logits_to_keep is not None:
873
+ model_inputs["num_logits_to_keep"] = num_logits_to_keep
874
+
875
+ model_inputs.update(
876
+ {
877
+ "position_ids": position_ids,
878
+ "cache_position": cache_position,
879
+ "past_key_values": past_key_values,
880
+ "use_cache": use_cache,
881
+ "attention_mask": attention_mask,
882
+ }
883
+ )
884
+ return model_inputs
885
+
886
+
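A hedged generation sketch for the causal head: the asserts above reject `inputs_embeds`, `past_key_values` and `use_cache`, so generation must run cache-free. The path is again a placeholder:

```python
from transformers import AutoTokenizer, AutoModelForCausalLM

ckpt = "path/to/gptbert-checkpoint"  # placeholder
tokenizer = AutoTokenizer.from_pretrained(ckpt)
model = AutoModelForCausalLM.from_pretrained(ckpt, trust_remote_code=True)

inputs = tokenizer("Oslo er", return_tensors="pt")
ids = model.generate(**inputs, max_new_tokens=10, do_sample=False, use_cache=False)
print(tokenizer.decode(ids[0], skip_special_tokens=True))
```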
887
+ class GptBertForSequenceClassification(GptBertModel):
888
+ _keys_to_ignore_on_load_missing = ["classifier.emb2vocab.weight", "classifier.emb2vocab.bias"]
889
+ _keys_to_ignore_on_load_unexpected = ["classifier.emb2vocab.weight", "classifier.emb2vocab.bias"]
890
+
891
+ def __init__(self, config: GptBertConfig, **kwargs):
892
+ super().__init__(config, add_mlm_layer=False, **kwargs)
893
+
894
+ self.num_labels = config.num_labels
895
+ self.classifier = Classifier(config, self.num_labels)
896
+ self.post_init()
897
+
898
+ def forward(
899
+ self,
900
+ input_ids: Optional[torch.Tensor] = None,
901
+ attention_mask: Optional[torch.Tensor] = None,
902
+ output_hidden_states: Optional[bool] = None,
903
+ return_dict: Optional[bool] = None,
904
+ labels: Optional[torch.LongTensor] = None,
905
+ **kwargs
906
+ ) -> Union[Tuple[torch.Tensor], SequenceClassifierOutput]:
907
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
908
+
909
+ sequence_output, contextualized_embeddings = self.get_contextualized_embeddings(input_ids, attention_mask, output_hidden_states)
910
+ logits = self.classifier(sequence_output[:, 0, :])
911
+
912
+ loss = None
913
+ if labels is not None:
914
+ if self.config.problem_type is None:
915
+ if self.num_labels == 1:
916
+ self.config.problem_type = "regression"
917
+ elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
918
+ self.config.problem_type = "single_label_classification"
919
+ else:
920
+ self.config.problem_type = "multi_label_classification"
921
+
922
+ if self.config.problem_type == "regression":
923
+ loss_fct = nn.MSELoss()
924
+ if self.num_labels == 1:
925
+ loss = loss_fct(logits.squeeze(), labels.squeeze())
926
+ else:
927
+ loss = loss_fct(logits, labels)
928
+ elif self.config.problem_type == "single_label_classification":
929
+ loss_fct = nn.CrossEntropyLoss()
930
+ loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
931
+ elif self.config.problem_type == "multi_label_classification":
932
+ loss_fct = nn.BCEWithLogitsLoss()
933
+ loss = loss_fct(logits, labels)
934
+
935
+ if not return_dict:
936
+ output = (
937
+ logits,
938
+ *([contextualized_embeddings] if output_hidden_states else [])
939
+ )
940
+ return ((loss,) + output) if loss is not None else output
941
+
942
+ return SequenceClassifierOutput(
943
+ loss=loss,
944
+ logits=logits,
945
+ hidden_states=contextualized_embeddings if output_hidden_states else None
946
+ )
947
+
948
+
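The loss dispatch above follows the standard Hugging Face convention for inferring `problem_type` when the config leaves it unset. As a self-contained illustration (the helper below is hypothetical, not part of the model):

```python
import torch

def infer_problem_type(num_labels: int, labels: torch.Tensor) -> str:
    # Hypothetical helper mirroring the branch logic in the forward pass above.
    if num_labels == 1:
        return "regression"
    if labels.dtype in (torch.long, torch.int):
        return "single_label_classification"
    return "multi_label_classification"

print(infer_problem_type(1, torch.tensor([0.7])))               # regression
print(infer_problem_type(3, torch.tensor([2])))                 # single_label_classification
print(infer_problem_type(3, torch.tensor([[0.0, 1.0, 1.0]])))   # multi_label_classification
```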
949
+ class GptBertForTokenClassification(GptBertModel):
950
+ _keys_to_ignore_on_load_missing = ["classifier.emb2vocab.weight", "classifier.emb2vocab.bias"]
951
+ _keys_to_ignore_on_load_unexpected = ["classifier.emb2vocab.weight", "classifier.emb2vocab.bias"]
952
+
953
+ def __init__(self, config: GptBertConfig, **kwargs):
954
+ super().__init__(config, add_mlm_layer=False, **kwargs)
955
+
956
+ self.num_labels = config.num_labels
957
+ self.classifier = Classifier(config, self.num_labels)
958
+ self.post_init()
959
+
960
+ def forward(
961
+ self,
962
+ input_ids: Optional[torch.Tensor] = None,
963
+ attention_mask: Optional[torch.Tensor] = None,
964
+ output_hidden_states: Optional[bool] = None,
965
+ return_dict: Optional[bool] = None,
966
+ labels: Optional[torch.LongTensor] = None,
967
+ **kwargs
968
+ ) -> Union[Tuple[torch.Tensor], TokenClassifierOutput]:
969
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
970
+
971
+ sequence_output, contextualized_embeddings = self.get_contextualized_embeddings(input_ids, attention_mask, output_hidden_states)
972
+ logits = self.classifier(sequence_output)
973
+
974
+ loss = None
975
+ if labels is not None:
976
+ loss_fct = nn.CrossEntropyLoss()
977
+ loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
978
+
979
+ if not return_dict:
980
+ output = (
981
+ logits,
982
+ *([contextualized_embeddings] if output_hidden_states else [])
984
+ )
985
+ return ((loss,) + output) if loss is not None else output
986
+
987
+ return TokenClassifierOutput(
988
+ loss=loss,
989
+ logits=logits,
990
+ hidden_states=contextualized_embeddings if output_hidden_states else None
992
+ )
993
+
994
+
995
+ class GptBertForQuestionAnswering(GptBertModel):
996
+ _keys_to_ignore_on_load_missing = ["classifier.emb2vocab.weight", "classifier.emb2vocab.bias"]
997
+ _keys_to_ignore_on_load_unexpected = ["classifier.emb2vocab.weight", "classifier.emb2vocab.bias"]
998
+
999
+ def __init__(self, config: GptBertConfig, **kwargs):
1000
+ super().__init__(config, add_mlm_layer=False, **kwargs)
1001
+
1002
+ self.num_labels = config.num_labels
1003
+ self.classifier = Classifier(config, self.num_labels)
1004
+ self.post_init()
1005
+
1006
+ def forward(
1007
+ self,
1008
+ input_ids: Optional[torch.Tensor] = None,
1009
+ attention_mask: Optional[torch.Tensor] = None,
1010
+ output_hidden_states: Optional[bool] = None,
1011
+ return_dict: Optional[bool] = None,
1012
+ start_positions: Optional[torch.Tensor] = None,
1013
+ end_positions: Optional[torch.Tensor] = None,
1014
+ **kwargs
1015
+ ) -> Union[Tuple[torch.Tensor], QuestionAnsweringModelOutput]:
1016
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1017
+
1018
+ sequence_output, contextualized_embeddings = self.get_contextualized_embeddings(input_ids, attention_mask, output_hidden_states)
1019
+ logits = self.classifier(sequence_output)
1020
+
1021
+ start_logits, end_logits = logits.split(1, dim=-1)
1022
+ start_logits = start_logits.squeeze(-1).contiguous()
1023
+ end_logits = end_logits.squeeze(-1).contiguous()
1024
+
1025
+ total_loss = None
1026
+ if start_positions is not None and end_positions is not None:
1027
+ # If we are on multi-GPU, split add a dimension
1028
+ if len(start_positions.size()) > 1:
1029
+ start_positions = start_positions.squeeze(-1)
1030
+ if len(end_positions.size()) > 1:
1031
+ end_positions = end_positions.squeeze(-1)
1032
+
1033
+ # sometimes the start/end positions are outside our model inputs, we ignore these terms
1034
+ ignored_index = start_logits.size(1)
1035
+ start_positions = start_positions.clamp(0, ignored_index)
1036
+ end_positions = end_positions.clamp(0, ignored_index)
1037
+
1038
+ loss_fct = nn.CrossEntropyLoss(ignore_index=ignored_index)
1039
+ start_loss = loss_fct(start_logits, start_positions)
1040
+ end_loss = loss_fct(end_logits, end_positions)
1041
+ total_loss = (start_loss + end_loss) / 2
1042
+
1043
+ if not return_dict:
1044
+ output = (
1045
+ start_logits,
1046
+ end_logits,
1047
+ *([contextualized_embeddings] if output_hidden_states else [])
1048
+ )
1049
+ return ((total_loss,) + output) if total_loss is not None else output
1050
+
1051
+ return QuestionAnsweringModelOutput(
1052
+ loss=total_loss,
1053
+ start_logits=start_logits,
1054
+ end_logits=end_logits,
1055
+ hidden_states=contextualized_embeddings if output_hidden_states else None
1056
+ )
1057
+
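One detail worth noting in the QA head: answer positions that fall outside the model input are clamped to `seq_len` and then excluded from the loss via `ignore_index`. A minimal, self-contained illustration:

```python
import torch
import torch.nn as nn

seq_len = 8
start_logits = torch.randn(2, seq_len)
start_positions = torch.tensor([3, 100]).clamp(0, seq_len)  # 100 -> 8, out of range
loss_fct = nn.CrossEntropyLoss(ignore_index=seq_len)
print(loss_fct(start_logits, start_positions))  # only the first example contributes
```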
1058
+
1059
+ class GptBertForMultipleChoice(GptBertModel):
1060
+ _keys_to_ignore_on_load_missing = ["classifier.emb2vocab.weight", "classifier.emb2vocab.bias"]
1061
+ _keys_to_ignore_on_load_unexpected = ["classifier.emb2vocab.weight", "classifier.emb2vocab.bias"]
1062
+
1063
+ def __init__(self, config: GptBertConfig, **kwargs):
1064
+ super().__init__(config, add_mlm_layer=False, **kwargs)
1065
+
1066
+ self.num_labels = getattr(config, "num_labels", 2)
1067
+ self.classifier = Classifier(config, 1)  # one score per choice; reshaped to (batch_size, num_choices) below
1068
+ self.post_init()
1069
+
1070
+ def forward(
1071
+ self,
1072
+ input_ids: Optional[torch.Tensor] = None,
1073
+ attention_mask: Optional[torch.Tensor] = None,
1074
+ labels: Optional[torch.Tensor] = None,
1075
+ output_hidden_states: Optional[bool] = None,
1076
+ return_dict: Optional[bool] = None,
1077
+ **kwargs
1078
+ ) -> Union[Tuple[torch.Tensor], MultipleChoiceModelOutput]:
1079
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1080
+ num_choices = input_ids.shape[1]
1081
+
1082
+ flat_input_ids = input_ids.view(-1, input_ids.size(-1))
1083
+ flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
1084
+
1085
+ sequence_output, contextualized_embeddings = self.get_contextualized_embeddings(flat_input_ids, flat_attention_mask, output_hidden_states)
1086
+ logits = self.classifier(sequence_output[:, 0, :])  # pool the first token of each flattened choice, as in the sequence classification head
1087
+ reshaped_logits = logits.view(-1, num_choices)
1088
+
1089
+ loss = None
1090
+ if labels is not None:
1091
+ loss_fct = nn.CrossEntropyLoss()
1092
+ loss = loss_fct(reshaped_logits, labels)
1093
+
1094
+ if not return_dict:
1095
+ output = (
1096
+ reshaped_logits,
1097
+ *([contextualized_embeddings] if output_hidden_states else [])
1098
+ )
1099
+ return ((loss,) + output) if loss is not None else output
1100
+
1101
+ return MultipleChoiceModelOutput(
1102
+ loss=loss,
1103
+ logits=reshaped_logits,
1104
+ hidden_states=contextualized_embeddings if output_hidden_states else None
1105
+ )
last-checkpoint/modules.json ADDED
@@ -0,0 +1,14 @@
1
+ [
2
+ {
3
+ "idx": 0,
4
+ "name": "0",
5
+ "path": "",
6
+ "type": "sentence_transformers.sparse_encoder.models.MLMTransformer"
7
+ },
8
+ {
9
+ "idx": 1,
10
+ "name": "1",
11
+ "path": "1_SpladePooling",
12
+ "type": "sentence_transformers.sparse_encoder.models.SpladePooling"
13
+ }
14
+ ]
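The two modules listed above (an `MLMTransformer` followed by `SpladePooling`) are what Sentence Transformers reassembles when the checkpoint directory is loaded as a `SparseEncoder`. A hedged sketch with a placeholder path:

```python
from sentence_transformers import SparseEncoder

# Placeholder path to a directory containing this checkpoint.
model = SparseEncoder("path/to/last-checkpoint", trust_remote_code=True)
emb = model.encode(["Hva er SPLADE?"])
print(emb.shape)  # (1, 51200): one weight per vocabulary entry, ideally mostly zero after training
```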
last-checkpoint/optimizer.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6ee91d450dde2beb96e8c2398912baa13f858a3b4f7cee07b28a9b96f3e588ef
3
+ size 1457369077
last-checkpoint/rng_state_0.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4cdc784e3b91bc23bce54961fdaef58e6442cd03f625edf44e230178fd37f8fa
3
+ size 14917
last-checkpoint/rng_state_1.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a36844f32afb06c561965a6f6eb81809058336e154b9d7e2fb6b83900a7ad0fa
3
+ size 14917
last-checkpoint/scheduler.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d959e23fadc9c5a5f14d7b9c3a56d1fd374b1bcf4b39d4b142f83de164ff2685
3
+ size 1465
last-checkpoint/sentence_bert_config.json ADDED
@@ -0,0 +1,4 @@
1
+ {
2
+ "max_seq_length": null,
3
+ "do_lower_case": false
4
+ }
last-checkpoint/special_tokens_map.json ADDED
@@ -0,0 +1,51 @@
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "cls_token": {
10
+ "content": "<s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "eos_token": {
17
+ "content": "</s>",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "mask_token": {
24
+ "content": "<mask>",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ },
30
+ "pad_token": {
31
+ "content": "<pad>",
32
+ "lstrip": false,
33
+ "normalized": false,
34
+ "rstrip": false,
35
+ "single_word": false
36
+ },
37
+ "sep_token": {
38
+ "content": "</s>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false
43
+ },
44
+ "unk_token": {
45
+ "content": "<unk>",
46
+ "lstrip": false,
47
+ "normalized": false,
48
+ "rstrip": false,
49
+ "single_word": false
50
+ }
51
+ }
last-checkpoint/tokenizer.json ADDED
The diff for this file is too large to render.
last-checkpoint/tokenizer_config.json ADDED
@@ -0,0 +1,143 @@
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "<unk>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "1": {
12
+ "content": "<s>",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "2": {
20
+ "content": "</s>",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "3": {
28
+ "content": "<pad>",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "4": {
36
+ "content": "<mask>",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ },
43
+ "5": {
44
+ "content": "<special_0>",
45
+ "lstrip": false,
46
+ "normalized": false,
47
+ "rstrip": false,
48
+ "single_word": false,
49
+ "special": true
50
+ },
51
+ "6": {
52
+ "content": "<special_1>",
53
+ "lstrip": false,
54
+ "normalized": false,
55
+ "rstrip": false,
56
+ "single_word": false,
57
+ "special": true
58
+ },
59
+ "7": {
60
+ "content": "<special_2>",
61
+ "lstrip": false,
62
+ "normalized": false,
63
+ "rstrip": false,
64
+ "single_word": false,
65
+ "special": true
66
+ },
67
+ "8": {
68
+ "content": "<special_3>",
69
+ "lstrip": false,
70
+ "normalized": false,
71
+ "rstrip": false,
72
+ "single_word": false,
73
+ "special": true
74
+ },
75
+ "9": {
76
+ "content": "<special_4>",
77
+ "lstrip": false,
78
+ "normalized": false,
79
+ "rstrip": false,
80
+ "single_word": false,
81
+ "special": true
82
+ },
83
+ "10": {
84
+ "content": "<special_5>",
85
+ "lstrip": false,
86
+ "normalized": false,
87
+ "rstrip": false,
88
+ "single_word": false,
89
+ "special": true
90
+ },
91
+ "11": {
92
+ "content": "<special_6>",
93
+ "lstrip": false,
94
+ "normalized": false,
95
+ "rstrip": false,
96
+ "single_word": false,
97
+ "special": true
98
+ },
99
+ "12": {
100
+ "content": "<special_7>",
101
+ "lstrip": false,
102
+ "normalized": false,
103
+ "rstrip": false,
104
+ "single_word": false,
105
+ "special": true
106
+ },
107
+ "13": {
108
+ "content": "<special_8>",
109
+ "lstrip": false,
110
+ "normalized": false,
111
+ "rstrip": false,
112
+ "single_word": false,
113
+ "special": true
114
+ },
115
+ "14": {
116
+ "content": "<special_9>",
117
+ "lstrip": false,
118
+ "normalized": false,
119
+ "rstrip": false,
120
+ "single_word": false,
121
+ "special": true
122
+ },
123
+ "15": {
124
+ "content": "<special_10>",
125
+ "lstrip": false,
126
+ "normalized": false,
127
+ "rstrip": false,
128
+ "single_word": false,
129
+ "special": true
130
+ }
131
+ },
132
+ "bos_token": "<s>",
133
+ "clean_up_tokenization_spaces": false,
134
+ "cls_token": "<s>",
135
+ "eos_token": "</s>",
136
+ "extra_special_tokens": {},
137
+ "mask_token": "<mask>",
138
+ "model_max_length": 4096,
139
+ "pad_token": "<pad>",
140
+ "sep_token": "</s>",
141
+ "tokenizer_class": "PreTrainedTokenizerFast",
142
+ "unk_token": "<unk>"
143
+ }
last-checkpoint/trainer_state.json ADDED
@@ -0,0 +1,185 @@
1
+ {
2
+ "best_global_step": 500,
3
+ "best_metric": 0.027178706104522946,
4
+ "best_model_checkpoint": "models/splade-norbert4-base-retrieval-only/checkpoint-500",
5
+ "epoch": 0.04797083373309028,
6
+ "eval_steps": 500,
7
+ "global_step": 500,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "base_loss": 37895.6328,
14
+ "document_regularizer_loss": 0.0598,
15
+ "epoch": 0.004797083373309028,
16
+ "grad_norm": 259787.75,
17
+ "learning_rate": 9.395973154362417e-07,
18
+ "loss": 37895.69,
19
+ "query_regularizer_loss": 0.001,
20
+ "step": 50
21
+ },
22
+ {
23
+ "base_loss": 10001.6025,
24
+ "document_regularizer_loss": 0.4482,
25
+ "epoch": 0.009594166746618057,
26
+ "grad_norm": 505128.8125,
27
+ "learning_rate": 1.8983700862895495e-06,
28
+ "loss": 10002.0562,
29
+ "query_regularizer_loss": 0.0037,
30
+ "step": 100
31
+ },
32
+ {
33
+ "base_loss": 3804.3779,
34
+ "document_regularizer_loss": 1.0922,
35
+ "epoch": 0.014391250119927085,
36
+ "grad_norm": 53786.4296875,
37
+ "learning_rate": 2.8571428571428573e-06,
38
+ "loss": 3805.4731,
39
+ "query_regularizer_loss": 0.0023,
40
+ "step": 150
41
+ },
42
+ {
43
+ "base_loss": 921.3414,
44
+ "document_regularizer_loss": 1.7523,
45
+ "epoch": 0.019188333493236114,
46
+ "grad_norm": 48469.69140625,
47
+ "learning_rate": 3.815915627996165e-06,
48
+ "loss": 923.0944,
49
+ "query_regularizer_loss": 0.0007,
50
+ "step": 200
51
+ },
52
+ {
53
+ "base_loss": 512.5709,
54
+ "document_regularizer_loss": 2.2081,
55
+ "epoch": 0.02398541686654514,
56
+ "grad_norm": 9211.7822265625,
57
+ "learning_rate": 4.774688398849473e-06,
58
+ "loss": 514.7795,
59
+ "query_regularizer_loss": 0.0005,
60
+ "step": 250
61
+ },
62
+ {
63
+ "base_loss": 282.497,
64
+ "document_regularizer_loss": 2.0475,
65
+ "epoch": 0.02878250023985417,
66
+ "grad_norm": 430.40460205078125,
67
+ "learning_rate": 5.733461169702781e-06,
68
+ "loss": 284.5449,
69
+ "query_regularizer_loss": 0.0003,
70
+ "step": 300
71
+ },
72
+ {
73
+ "base_loss": 88.6157,
74
+ "document_regularizer_loss": 1.4521,
75
+ "epoch": 0.0335795836131632,
76
+ "grad_norm": 660.3614501953125,
77
+ "learning_rate": 6.692233940556089e-06,
78
+ "loss": 90.0678,
79
+ "query_regularizer_loss": 0.0001,
80
+ "step": 350
81
+ },
82
+ {
83
+ "base_loss": 30.6636,
84
+ "document_regularizer_loss": 0.1846,
85
+ "epoch": 0.03837666698647223,
86
+ "grad_norm": 3.8934402465820312,
87
+ "learning_rate": 7.651006711409396e-06,
88
+ "loss": 30.8482,
89
+ "query_regularizer_loss": 0.0,
90
+ "step": 400
91
+ },
92
+ {
93
+ "base_loss": 2.5066,
94
+ "document_regularizer_loss": 0.0005,
95
+ "epoch": 0.04317375035978125,
96
+ "grad_norm": 26.094982147216797,
97
+ "learning_rate": 8.609779482262704e-06,
98
+ "loss": 2.5071,
99
+ "query_regularizer_loss": 0.0,
100
+ "step": 450
101
+ },
102
+ {
103
+ "base_loss": 1.3518,
104
+ "document_regularizer_loss": 0.0007,
105
+ "epoch": 0.04797083373309028,
106
+ "grad_norm": 20.548200607299805,
107
+ "learning_rate": 9.568552253116012e-06,
108
+ "loss": 1.3525,
109
+ "query_regularizer_loss": 0.0,
110
+ "step": 500
111
+ },
112
+ {
113
+ "epoch": 0.04797083373309028,
114
+ "eval_NanoBEIR_mean_avg_flops": 51200.0,
115
+ "eval_NanoBEIR_mean_corpus_active_dims": 51200.0,
116
+ "eval_NanoBEIR_mean_corpus_sparsity_ratio": 0.0,
117
+ "eval_NanoBEIR_mean_dot_accuracy@1": 0.02,
118
+ "eval_NanoBEIR_mean_dot_accuracy@10": 0.12,
119
+ "eval_NanoBEIR_mean_dot_accuracy@3": 0.08,
120
+ "eval_NanoBEIR_mean_dot_accuracy@5": 0.08,
121
+ "eval_NanoBEIR_mean_dot_map@100": 0.006747512755501429,
122
+ "eval_NanoBEIR_mean_dot_mrr@10": 0.05088888888888889,
123
+ "eval_NanoBEIR_mean_dot_ndcg@10": 0.027178706104522946,
124
+ "eval_NanoBEIR_mean_dot_precision@1": 0.02,
125
+ "eval_NanoBEIR_mean_dot_precision@10": 0.026000000000000006,
126
+ "eval_NanoBEIR_mean_dot_precision@3": 0.03333333333333333,
127
+ "eval_NanoBEIR_mean_dot_precision@5": 0.032,
128
+ "eval_NanoBEIR_mean_dot_recall@1": 7.905138339920947e-05,
129
+ "eval_NanoBEIR_mean_dot_recall@10": 0.006349071275176555,
130
+ "eval_NanoBEIR_mean_dot_recall@3": 0.003312410422185988,
131
+ "eval_NanoBEIR_mean_dot_recall@5": 0.004545769460972766,
132
+ "eval_NanoBEIR_mean_query_active_dims": 51200.0,
133
+ "eval_NanoBEIR_mean_query_sparsity_ratio": 0.0,
134
+ "eval_NanoNFCorpus_avg_flops": 51200.0,
135
+ "eval_NanoNFCorpus_corpus_active_dims": 51200.0,
136
+ "eval_NanoNFCorpus_corpus_sparsity_ratio": 0.0,
137
+ "eval_NanoNFCorpus_dot_accuracy@1": 0.02,
138
+ "eval_NanoNFCorpus_dot_accuracy@10": 0.12,
139
+ "eval_NanoNFCorpus_dot_accuracy@3": 0.08,
140
+ "eval_NanoNFCorpus_dot_accuracy@5": 0.08,
141
+ "eval_NanoNFCorpus_dot_map@100": 0.006747512755501429,
142
+ "eval_NanoNFCorpus_dot_mrr@10": 0.05088888888888889,
143
+ "eval_NanoNFCorpus_dot_ndcg@10": 0.027178706104522946,
144
+ "eval_NanoNFCorpus_dot_precision@1": 0.02,
145
+ "eval_NanoNFCorpus_dot_precision@10": 0.026000000000000006,
146
+ "eval_NanoNFCorpus_dot_precision@3": 0.03333333333333333,
147
+ "eval_NanoNFCorpus_dot_precision@5": 0.032,
148
+ "eval_NanoNFCorpus_dot_recall@1": 7.905138339920947e-05,
149
+ "eval_NanoNFCorpus_dot_recall@10": 0.006349071275176555,
150
+ "eval_NanoNFCorpus_dot_recall@3": 0.003312410422185988,
151
+ "eval_NanoNFCorpus_dot_recall@5": 0.004545769460972766,
152
+ "eval_NanoNFCorpus_query_active_dims": 51200.0,
153
+ "eval_NanoNFCorpus_query_sparsity_ratio": 0.0,
154
+ "eval_base_loss": 2.2657,
155
+ "eval_document_regularizer_loss": 0.0006,
156
+ "eval_loss": 2.2663323879241943,
157
+ "eval_query_regularizer_loss": 0.0,
158
+ "eval_runtime": 364.216,
159
+ "eval_samples_per_second": 39.696,
160
+ "eval_steps_per_second": 0.621,
161
+ "step": 500
162
+ }
163
+ ],
164
+ "logging_steps": 50,
165
+ "max_steps": 10423,
166
+ "num_input_tokens_seen": 0,
167
+ "num_train_epochs": 1,
168
+ "save_steps": 500,
169
+ "stateful_callbacks": {
170
+ "TrainerControl": {
171
+ "args": {
172
+ "should_epoch_stop": false,
173
+ "should_evaluate": false,
174
+ "should_log": false,
175
+ "should_save": true,
176
+ "should_training_stop": false
177
+ },
178
+ "attributes": {}
179
+ }
180
+ },
181
+ "total_flos": 0.0,
182
+ "train_batch_size": 16,
183
+ "trial_name": null,
184
+ "trial_params": null
185
+ }
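One hedged reading of the eval block above: `query_active_dims` and `corpus_active_dims` equal the full 51,200-dimensional vocabulary and both sparsity ratios are 0.0, so at step 500 the FLOPS regularizer has not yet pushed any dimension to zero. The ratio is presumably computed as one minus the active fraction:

```python
# Assumed definition, consistent with the reported values above.
vocab_size = 51200
active_dims = 51200.0
sparsity_ratio = 1.0 - active_dims / vocab_size
print(sparsity_ratio)  # 0.0 -> every dimension still active at this checkpoint
```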
last-checkpoint/training_args.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:80a238e2a37bc1da8ffa7f6ac3192a8f23e297afa1bc8b0a28a0d40a8e101359
3
+ size 6353