Initial model upload - clean repository
Browse files- .gitattributes +7 -0
- README.md +292 -0
- config.json +43 -0
- large_mlm_vs_downstream.png +0 -0
- model.onnx +3 -0
- model.safetensors +3 -0
- model_performance_2d.png +3 -0
- plots/eval_loss.png +3 -0
- plots/eval_masked_accuracy.png +3 -0
- plots/grad_L2_norm.png +3 -0
- plots/lr_schedule.png +3 -0
- plots/train_loss.png +3 -0
- plots/train_masked_accuracy.png +3 -0
- pytorch_model.bin +3 -0
- special_tokens_map.json +51 -0
- tokenizer.json +0 -0
- tokenizer_config.json +58 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,10 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
plots/eval_loss.png filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
plots/eval_masked_accuracy.png filter=lfs diff=lfs merge=lfs -text
|
| 38 |
+
plots/grad_L2_norm.png filter=lfs diff=lfs merge=lfs -text
|
| 39 |
+
plots/train_loss.png filter=lfs diff=lfs merge=lfs -text
|
| 40 |
+
plots/train_masked_accuracy.png filter=lfs diff=lfs merge=lfs -text
|
| 41 |
+
plots/lr_schedule.png filter=lfs diff=lfs merge=lfs -text
|
| 42 |
+
model_performance_2d.png filter=lfs diff=lfs merge=lfs -text
|
README.md
ADDED
|
@@ -0,0 +1,292 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
language:
|
| 3 |
+
- tr
|
| 4 |
+
- en
|
| 5 |
+
license: apache-2.0
|
| 6 |
+
tags:
|
| 7 |
+
- fill-mask
|
| 8 |
+
- turkish
|
| 9 |
+
- legal
|
| 10 |
+
- turkish-legal
|
| 11 |
+
- mecellem
|
| 12 |
+
- modernbert
|
| 13 |
+
- TRUBA
|
| 14 |
+
- MN5
|
| 15 |
+
base_model: answerdotai/ModernBERT-large
|
| 16 |
+
pipeline_tag: fill-mask
|
| 17 |
+
---
|
| 18 |
+
|
| 19 |
+
# Mursit-Large
|
| 20 |
+
|
| 21 |
+
[GitHub: mecellem-models](https://github.com/newmindai/mecellem-models) [Hugging Face Space: Mizan](https://huggingface.co/spaces/newmindai/Mizan) [License: Apache 2.0](https://opensource.org/licenses/Apache-2.0)
|
| 22 |
+
|
| 23 |
+
## Model Description
|
| 24 |
+
|
| 25 |
+
Mursit-Large is a large-scale Turkish Masked Language Model pre-trained entirely from scratch on Turkish-dominant corpora. The model is based on ModernBERT-large architecture (403M parameters) and serves as a foundation model for downstream tasks. Unlike domain-adaptive approaches that continue training from existing checkpoints, this model is initialized randomly and trained on a carefully curated dataset combining Turkish legal text with general web data.
|
| 26 |
+
|
| 27 |
+
**Key Features:**
|
| 28 |
+
- Pre-trained from scratch on approximately 112.7 billion tokens of Turkish-dominant corpus
|
| 29 |
+
- Achieves 67.25% MLM accuracy on Turkish datasets (80-10-10 masking strategy, evaluated at 15% masking rate)
|
| 30 |
+
- Serves as foundation for downstream embedding tasks (Mursit-Large-TR-Retrieval)
|
| 31 |
+
- Custom tokenizer optimized for Turkish morphological structure
|
| 32 |
+
- Pre-trained with 30% masking rate (ModernBERT/RoBERTa approach) but evaluated at 15% masking rate for fair comparison
|
| 33 |
+
|
| 34 |
+
**Model Type:** Masked Language Model (MLM)
|
| 35 |
+
**Parameters:** 403M
|
| 36 |
+
**Base Architecture:** ModernBERT-large
|
| 37 |
+
**Hidden Size:** 1,024
|
| 38 |
+
**Max Sequence Length:** 2,048 tokens
|
| 39 |
+
|
| 40 |
+
### Architecture Details
|
| 41 |
+
|
| 42 |
+
- **Layers:** 28 transformer layers
|
| 43 |
+
- **Hidden Size:** 1,024
|
| 44 |
+
- **FFN Size:** 2,624
|
| 45 |
+
- **Attention Heads:** 16 heads with 64 dimensions each
|
| 46 |
+
- **Activation:** GeGLU (Gated Linear Units with GELU)
|
| 47 |
+
- **Normalization:** RMSNorm
|
| 48 |
+
- **Position Embeddings:** Rotary positional embeddings (RoPE) with θ=20,000
|
| 49 |
+
- **Window Size:** 128
|
| 50 |
+
- **Vocabulary Size:** 59,008 tokens
|
| 51 |
+
|
| 52 |
+
### Training Details
|
| 53 |
+
|
| 54 |
+
**Pre-training:**
|
| 55 |
+
- **Dataset:** Turkish-dominant corpus totaling approximately 112.7 billion tokens
|
| 56 |
+
- **Legal Sources:**
|
| 57 |
+
- Court of Cassation (Yargıtay): 10.3M sequences, ~3.43B tokens
|
| 58 |
+
- Council of State (Danıştay): 151K sequences, ~0.11B tokens
|
| 59 |
+
- Academic theses (YÖKTEZ): 21.1M sequences, ~9.61B tokens (after DocsOCR processing)
|
| 60 |
+
- **General Turkish Sources:**
|
| 61 |
+
- FineWeb2: General Turkish web data
|
| 62 |
+
- CulturaX: Multilingual corpus (Turkish subset)
|
| 63 |
+
- Total general Turkish: 212M sequences, ~96.17B tokens
|
| 64 |
+
- **Data Processing:** SemHash-based semantic deduplication, FineWeb quality filtering, URL-based filtering, page-packing for YÖKTEZ documents
|
| 65 |
+
- **Training Method:** Masked Language Modeling (MLM) with 30% masking probability during pre-training (evaluation uses a 15% masking rate)
|
| 66 |
+
- **Masking Strategy:** 80% [MASK], 10% random token, 10% unchanged (80-10-10 strategy)
|
| 67 |
+
- **Framework:** MosaicML Composer with Decoupled StableAdamW optimizer
|
| 68 |
+
- **Learning Rate:** 8×10⁻⁴ with warmup_stable_decay schedule
|
| 69 |
+
- **Precision:** BF16 mixed precision
|
| 70 |
+
- **Hardware:** MareNostrum 5 supercomputer (BSC), 128×H100 GPUs
|
| 71 |
+
|
| 72 |
+
**MLM Accuracy:** 67.25% (evaluated on Turkish datasets: blackerx/turkish_v2, fthbrmnby/turkish_product_reviews, hazal/Turkish-Biomedical-corpus-trM, newmindai/EuroHPC-Legal)
|
| 73 |
+
|
| 74 |
+
### MLM Accuracy Scores (80-10-10 Strategy) on Turkish Datasets
|
| 75 |
+
|
| 76 |
+
The following table presents MLM accuracy scores (averaged across the 80-10-10 strategy) for our pre-trained models and baseline MLM models evaluated on Turkish datasets. *This model's results are highlighted in italics.*
|
| 77 |
+
|
| 78 |
+
| Model | MLM Avg (%) |
|
| 79 |
+
|-------|-------------|
|
| 80 |
+
| boun-tabilab/TabiBERT | **69.57** |
|
| 81 |
+
| *newmindai/Mursit-Large* | *67.25* |
|
| 82 |
+
| ytu-ce-cosmos/turkish-large-bert-cased | 65.03 |
|
| 83 |
+
| dbmdz/bert-base-turkish-cased | 64.98 |
|
| 84 |
+
| newmindai/Mursit-Base | 64.05 |
|
| 85 |
+
| KocLab-Bilkent/BERTurk-Legal | 54.10 |
|
| 86 |
+
| ytu-ce-cosmos/turkish-base-bert-uncased | 52.69 |
|
| 87 |
+
|
| 88 |
+
*MLM accuracy averaged across the 80-10-10 masking strategy. turkish-base-bert-uncased was evaluated only on uncased datasets. Evaluation datasets: blackerx/turkish_v2, fthbrmnby/turkish_product_reviews, hazal/Turkish-Biomedical-corpus-trM, newmindai/EuroHPC-Legal. All experiments are reproducible (see Section A.2 in the paper).*
|
| 89 |
+
|
| 90 |
+
## Performance on MTEB-Turkish Benchmark
|
| 91 |
+
|
| 92 |
+
The following visualization shows the model's performance compared to other Turkish language models:
|
| 93 |
+
|
| 94 |
+

|
| 95 |
+
|
| 96 |
+
*Model Performance Comparison: Legal Score vs. MTEB Score. MLM models (blue circles) form a distinct cluster. Mursit-Large achieves competitive performance among Turkish MLM models.*
|
| 97 |
+
|
| 98 |
+
This model was evaluated on the comprehensive MTEB-Turkish benchmark for embedding tasks using mean pooling over token representations followed by L2 normalization.
|
| 99 |
+
|
| 100 |
+
### Comprehensive Benchmark Results
|
| 101 |
+
|
| 102 |
+
The following table presents comprehensive evaluation results across all models evaluated on the MTEB-Turkish benchmark. *This model's results are highlighted in italics.*
|
| 103 |
+
|
| 104 |
+
| Model | MTEB | Legal | Cls. | Clus. | Pair | Ret. | STS | Cont. | Reg. | Case | Params | Type |
|
| 105 |
+
|-------|------|-------|------|-------|------|------|-----|-------|------|------|--------|------|
|
| 106 |
+
| embeddinggemma-300m | **65.42** | 50.63 | **77.74** | **45.05** | **80.02** | **55.06** | 69.22 | 83.97 | **39.56** | 28.38 | 307M | Emb. |
|
| 107 |
+
| bge-m3 | 62.87 | **51.16** | 75.35 | 35.86 | 78.88 | 54.42 | **69.83** | **86.08** | 38.09 | **29.3** | 567M | Emb. |
|
| 108 |
+
| Mursit-Embed-Qwen3-1.7B-TR | 56.84 | 34.76 | 68.46 | 42.22 | 59.67 | 50.1 | 63.77 | 70.22 | 17.94 | 16.11 | 1.7B | CLM-E. |
|
| 109 |
+
| Mursit-Large-TR-Retrieval | 56.87 | 46.56 | 67.72 | 41.15 | 59.78 | 51.69 | 64.01 | 81.78 | 32.67 | 25.24 | 403M | Emb. |
|
| 110 |
+
| Mursit-Base-TR-Retrieval | 55.86 | 47.52 | 66.25 | 39.75 | 61.31 | 50.07 | 61.9 | 80.4 | 34.1 | 28.07 | 155M | Emb. |
|
| 111 |
+
| Mursit-Embed-Qwen3-4B-TR | 53.65 | 37.0 | 67.29 | 36.68 | 58.36 | 51.12 | 54.77 | 69.25 | 24.21 | 17.56 | 4B | CLM-E. |
|
| 112 |
+
|-------|------|-------|------|------|------|------|-----|-------|------|------|--------|------|
|
| 113 |
+
| bert-base-turkish-uncased | 46.23 | 24.94 | 68.05 | 33.81 | 60.44 | 32.01 | 36.85 | 52.47 | 12.05 | 10.29 | 110M | MLM |
|
| 114 |
+
| turkish-large-bert-cased | 45.3 | 19.12 | 67.43 | 34.24 | 60.11 | 28.68 | 36.04 | 47.57 | 5.93 | 3.85 | 337M | MLM |
|
| 115 |
+
| bert-base-turkish-cased | 45.17 | 24.41 | 66.39 | 35.28 | 60.05 | 30.52 | 33.62 | 54.03 | 10.13 | 9.07 | 110M | MLM |
|
| 116 |
+
| BERTurk-Legal | 42.02 | 32.63 | 60.61 | 26.24 | 59.51 | 25.8 | 37.94 | 61.4 | 15.51 | 20.99 | 184M | MLM |
|
| 117 |
+
| *Mursit-Large* | 41.75 | 23.71 | 62.95 | 25.34 | 58.04 | 27.4 | 35.01 | 42.74 | 11.29 | 17.1 | 403M | MLM |
|
| 118 |
+
| turkish-base-bert-uncased | 44.68 | 27.58 | 66.22 | 30.23 | 58.84 | 31.4 | 36.74 | 56.6 | 13.39 | 12.74 | 110M | MLM |
|
| 119 |
+
| Mursit-Base | 40.23 | 17.93 | 59.78 | 25.48 | 58.65 | 20.82 | 36.45 | 36.0 | 7.4 | 10.4 | 155M | MLM |
|
| 120 |
+
| mmBERT-base | 39.65 | 12.15 | 61.84 | 26.77 | 59.25 | 15.83 | 34.56 | 34.45 | 1.33 | 0.68 | 306M | MLM |
|
| 121 |
+
| TabiBERT | 37.77 | 11.5 | 59.63 | 25.75 | 58.19 | 14.96 | 30.32 | 32.02 | 1.86 | 0.63 | 148M | MLM |
|
| 122 |
+
| ModernBERT-base | 23.8 | 2.99 | 39.06 | 2.01 | 53.95 | 2.1 | 21.91 | 7.92 | 0.62 | 0.43 | 149M | MLM |
|
| 123 |
+
| ModernBERT-large | 23.74 | 2.44 | 39.44 | 3.9 | 53.73 | 1.8 | 19.85 | 6.12 | 0.62 | 0.59 | 394M | MLM |
|
| 124 |
+
|
| 125 |
+
**Column abbreviations:** MTEB = mean performance across task types; Legal = weighted average of the Cont., Reg., and Case scores; Cls. = Classification (accuracy on Turkish classification tasks); Clus. = Clustering (V-measure on clustering tasks); Pair = Pair Classification (average precision on pair classification tasks such as NLI); Ret. = Retrieval (nDCG@10 on information retrieval tasks); STS = Semantic Textual Similarity (Spearman correlation); Cont. = Contracts (nDCG@10 on legal contract retrieval); Reg. = Regulation (nDCG@10 on regulatory text retrieval); Case = Caselaw (nDCG@10 on case law retrieval); Params = number of model parameters; Type = model type (Emb. = Embedding, CLM-E. = CLM-Embedding, MLM = Masked Language Model). **Bold values** indicate the highest score in each column.
|
| 126 |
+
|
| 127 |
+
**Key Findings:**
|
| 128 |
+
- The model shows substantial improvement over ModernBERT baselines (which are monolingual English models), validating the effectiveness of Turkish-specific pre-training
|
| 129 |
+
- Pre-training alone without embedding-specific fine-tuning yields limited utility for retrieval tasks
|
| 130 |
+
- Language-specific pre-training is critical, as monolingual English models show limited cross-lingual transfer to Turkish
|
| 131 |
+
- The model demonstrates that improvements in MLM accuracy do not always directly translate to better downstream task performance
|
| 132 |
+
|
| 133 |
+
### MLM vs Downstream Performance Analysis
|
| 134 |
+
|
| 135 |
+
The following visualization shows the relationship between MLM validation loss and downstream retrieval performance:
|
| 136 |
+
|
| 137 |
+

|
| 138 |
+
|
| 139 |
+
*Relationship between MLM validation loss and downstream retrieval performance across ModernBERT-large versions. This analysis shows how changes in MLM validation loss relate to downstream task performance; a lower MLM loss does not always translate directly into better downstream results.*
|
| 140 |
+
|
| 141 |
+
**Note:** This model is primarily designed for Masked Language Modeling tasks. Embedding performance is provided for reference using standard mean pooling. For optimal retrieval performance, consider using the post-trained retrieval variants (Mursit-Base-TR-Retrieval or Mursit-Large-TR-Retrieval).
|
| 142 |
+
|
| 143 |
+
## Reproducibility
|
| 144 |
+
|
| 145 |
+
To reproduce the MLM benchmark results for this model, please refer to:
|
| 146 |
+
|
| 147 |
+
- **MLM Benchmark Results:** [github.com/newmindai/mecellem-models/benchmark/mlm](https://github.com/newmindai/mecellem-models/tree/main/benchmark/mlm) - Contains code and evaluation configurations for reproducing MLM accuracy scores on Turkish datasets using the 80-10-10 masking strategy.
|
| 148 |
+
|
| 149 |
+
## Usage
|
| 150 |
+
|
| 151 |
+
### Installation
|
| 152 |
+
|
| 153 |
+
```bash
|
| 154 |
+
pip install transformers torch
|
| 155 |
+
```
|
| 156 |
+
|
| 157 |
+
### Masked Language Modeling
|
| 158 |
+
|
| 159 |
+
```python
|
| 160 |
+
from transformers import AutoTokenizer, AutoModelForMaskedLM
|
| 161 |
+
import torch
|
| 162 |
+
|
| 163 |
+
tokenizer = AutoTokenizer.from_pretrained("newmindai/Mursit-Large")
|
| 164 |
+
model = AutoModelForMaskedLM.from_pretrained("newmindai/Mursit-Large")
|
| 165 |
+
|
| 166 |
+
text = f"Türkiye Cumhuriyeti'nin başkenti {tokenizer.mask_token}'dir."
|
| 167 |
+
inputs = tokenizer(text, return_tensors="pt")
|
| 168 |
+
|
| 169 |
+
with torch.no_grad():
|
| 170 |
+
    outputs = model(**inputs)
|
| 171 |
+
mask_token_index = (inputs["input_ids"] == tokenizer.mask_token_id)[0].nonzero(as_tuple=True)[0]
|
| 172 |
+
predictions = torch.nn.functional.softmax(outputs.logits[0, mask_token_index], dim=-1)
|
| 173 |
+
|
| 174 |
+
top_k = 5
|
| 175 |
+
top_indices = torch.topk(predictions[0], top_k).indices
|
| 176 |
+
for idx in top_indices:
|
| 177 |
+
    token = tokenizer.decode([idx])
|
| 178 |
+
    score = predictions[0][idx].item()
|
| 179 |
+
    print(f"{token}: {score:.4f}")
|
| 180 |
+
```
|
| 181 |
+
# ONNX Model Inference - Masked Language Modeling (MLM)
|
| 182 |
+
|
| 183 |
+
This section demonstrates how to run the ONNX export of this model (`model.onnx`, hosted on the Hugging Face Hub) for masked language modeling.
|
| 184 |
+
|
| 185 |
+
## Exporting Model to ONNX
|
| 186 |
+
|
| 187 |
+
To export the model to ONNX format for MLM, use the `optimum-cli` command:
|
| 188 |
+
|
| 189 |
+
```bash
|
| 190 |
+
optimum-cli export onnx \
|
| 191 |
+
-m newmindai/Mursit-Large \
|
| 192 |
+
--task fill-mask \
|
| 193 |
+
onnx/MursitLarge
|
| 194 |
+
```
|
| 195 |
+
|
| 196 |
+
This will create the `model.onnx` file in the specified output directory.
|
| 197 |
+
|
| 198 |
+
## Installation
|
| 199 |
+
|
| 200 |
+
```bash
|
| 201 |
+
pip install onnxruntime-gpu transformers huggingface_hub numpy
|
| 202 |
+
```
|
| 203 |
+
|
| 204 |
+
## Usage
|
| 205 |
+
|
| 206 |
+
```python
|
| 207 |
+
import numpy as np
|
| 208 |
+
import onnxruntime as ort
|
| 209 |
+
from transformers import AutoTokenizer
|
| 210 |
+
from huggingface_hub import hf_hub_download
|
| 211 |
+
|
| 212 |
+
repo_id = "newmindai/Mursit-Large"
|
| 213 |
+
|
| 214 |
+
onnx_path = hf_hub_download(repo_id, "model.onnx")
|
| 215 |
+
|
| 216 |
+
tokenizer = AutoTokenizer.from_pretrained(repo_id)
|
| 217 |
+
|
| 218 |
+
sess = ort.InferenceSession(
|
| 219 |
+
onnx_path,
|
| 220 |
+
providers=["CUDAExecutionProvider", "CPUExecutionProvider"]
|
| 221 |
+
)
|
| 222 |
+
|
| 223 |
+
text = f"Bu bir {tokenizer.mask_token} cümledir."
|
| 224 |
+
inputs = tokenizer(text, return_tensors="np")
|
| 225 |
+
|
| 226 |
+
outputs = sess.run(None, dict(inputs))
|
| 227 |
+
logits = outputs[0]
|
| 228 |
+
|
| 229 |
+
mask_pos = np.where(inputs["input_ids"][0] == tokenizer.mask_token_id)[0][0]
|
| 230 |
+
mask_logits = logits[0, mask_pos]
|
| 231 |
+
|
| 232 |
+
top_k = 5
|
| 233 |
+
top_k_ids = np.argsort(mask_logits)[-top_k:][::-1]
|
| 234 |
+
predictions = tokenizer.convert_ids_to_tokens(top_k_ids)
|
| 235 |
+
|
| 236 |
+
print("MASK predictions:")
|
| 237 |
+
for p in predictions:
|
| 238 |
+
    print(p)
|
| 239 |
+
```
|
| 240 |
+
|
| 241 |
+
## Features
|
| 242 |
+
|
| 243 |
+
- **Automatic GPU/CPU selection**: Uses CUDA if available, otherwise falls back to CPU
|
| 244 |
+
- **Hugging Face integration**: Downloads model files directly from Hugging Face Hub
|
| 245 |
+
- **Masked token prediction**: Predicts the most likely tokens for masked positions
|
| 246 |
+
- **Top-K predictions**: Returns the top K most probable token predictions
|
| 247 |
+
|
| 248 |
+
## Use Cases
|
| 249 |
+
|
| 250 |
+
- Turkish language understanding tasks
|
| 251 |
+
- Text classification
|
| 252 |
+
- Named entity recognition
|
| 253 |
+
- Question answering
|
| 254 |
+
- Feature extraction for downstream tasks
|
| 255 |
+
|
| 256 |
+
## Reproducibility
|
| 257 |
+
|
| 258 |
+
To reproduce the MLM benchmark results for this model, please refer to:
|
| 259 |
+
|
| 260 |
+
- **MLM Benchmark Results:** [github.com/newmindai/mecellem-models/benchmark/mlm](https://github.com/newmindai/mecellem-models/tree/main/benchmark/mlm) - Contains code and evaluation configurations for reproducing MLM accuracy scores on Turkish datasets using the 80-10-10 masking strategy.
|
| 261 |
+
|
| 262 |
+
## Acknowledgments
|
| 263 |
+
|
| 264 |
+
This work was supported by the EuroHPC Joint Undertaking through project etur46 with access to the MareNostrum 5 supercomputer, hosted by Barcelona Supercomputing Center (BSC), Spain. MareNostrum 5 is owned by EuroHPC JU and operated by BSC. We are grateful to the BSC support team for their assistance with job scheduling, environment configuration, and technical guidance throughout the project.
|
| 265 |
+
|
| 266 |
+
The numerical calculations reported in this work were partially performed at TÜBİTAK ULAKBİM, High Performance and Grid Computing Center (TRUBA resources). The authors gratefully acknowledge the know-how provided by the MINERVA Support for expert guidance and collaboration opportunities in HPC-AI integration.
|
| 267 |
+
|
| 268 |
+
## References
|
| 269 |
+
|
| 270 |
+
If you use this model, please cite our paper:
|
| 271 |
+
|
| 272 |
+
```bibtex
|
| 273 |
+
@article{mecellem2026,
|
| 274 |
+
title={Mecellem Models: Turkish Models Trained from Scratch and Continually Pre-trained for the Legal Domain},
|
| 275 |
+
author={Uğur, Özgür and Göksu, Mahmut and Şavirdi, Esra and Çimen, Mahmut and Yılmaz, Musa and Demir, Alp Talha and Güllüce, Rumeysa and Çetin, İclal and Sağbaş, Ömer Can},
|
| 276 |
+
journal={Procedia Computer Science},
|
| 277 |
+
year={2026},
|
| 278 |
+
publisher={Elsevier}
|
| 279 |
+
}
|
| 280 |
+
```
|
| 281 |
+
### Base Model References
|
| 282 |
+
|
| 283 |
+
```bibtex
|
| 284 |
+
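@article{modernbert2025,
|
| 285 |
+
title={Smarter, Better, Faster, Longer: A Modern Bidirectional Encoder for Fast, Memory Efficient, and Long Context Finetuning and Inference},
|
| 286 |
+
author={Warner, Benjamin and Chaffin, Antoine and Clavi{\'e}, Benjamin and Weller, Orion and Hallstr{\"o}m, Oskar and Taghadouini, Said and Gallagher, Alexis and Biswas, Raja and Ladhak, Faisal and Aarsen, Tom and Cooper, Nathan and Adams, Griffin and Howard, Jeremy and Poli, Iacopo},
|
| 287 |
+
journal={arXiv preprint arXiv:2412.13663},
|
| 288 |
+
year={2024}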
|
| 289 |
+
}
|
| 290 |
+
```
|
| 291 |
+
|
| 292 |
+
<!-- Updated: 2026-01-15 09:38:24 -->
|
config.json
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"architectures": [
|
| 3 |
+
"ModernBertForMaskedLM"
|
| 4 |
+
],
|
| 5 |
+
"attention_bias": false,
|
| 6 |
+
"attention_dropout": 0.0,
|
| 7 |
+
"bos_token_id": 1,
|
| 8 |
+
"classifier_activation": "silu",
|
| 9 |
+
"classifier_bias": false,
|
| 10 |
+
"classifier_dropout": 0.0,
|
| 11 |
+
"classifier_pooling": "mean",
|
| 12 |
+
"cls_token_id": 1,
|
| 13 |
+
"decoder_bias": true,
|
| 14 |
+
"deterministic_flash_attn": false,
|
| 15 |
+
"embedding_dropout": 0.0,
|
| 16 |
+
"eos_token_id": 2,
|
| 17 |
+
"global_attn_every_n_layers": 3,
|
| 18 |
+
"global_rope_theta": 20000.0,
|
| 19 |
+
"gradient_checkpointing": false,
|
| 20 |
+
"hidden_activation": "gelu",
|
| 21 |
+
"hidden_size": 1024,
|
| 22 |
+
"initializer_cutoff_factor": 2.0,
|
| 23 |
+
"initializer_range": 0.02,
|
| 24 |
+
"intermediate_size": 2624,
|
| 25 |
+
"layer_norm_eps": 1e-05,
|
| 26 |
+
"local_attention": 128,
|
| 27 |
+
"local_rope_theta": 20000.0,
|
| 28 |
+
"max_position_embeddings": 2048,
|
| 29 |
+
"mlp_bias": false,
|
| 30 |
+
"mlp_dropout": 0.0,
|
| 31 |
+
"model_type": "modernbert",
|
| 32 |
+
"norm_bias": false,
|
| 33 |
+
"norm_eps": 1e-05,
|
| 34 |
+
"num_attention_heads": 16,
|
| 35 |
+
"num_hidden_layers": 28,
|
| 36 |
+
"pad_token_id": 0,
|
| 37 |
+
"position_embedding_type": "absolute",
|
| 38 |
+
"sep_token_id": 2,
|
| 39 |
+
"tie_word_embeddings": true,
|
| 40 |
+
"torch_dtype": "bfloat16",
|
| 41 |
+
"transformers_version": "4.48.0",
|
| 42 |
+
"vocab_size": 59008
|
| 43 |
+
}
|
large_mlm_vs_downstream.png
ADDED
|
model.onnx
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:37177cc0b0d406fbcdc2c6120a56410b1889c06fcd2c9379cc016e1935e1228f
|
| 3 |
+
size 1619728607
|
model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:33416153179089afb9bb2150e1d20e9ba2d24a4d16550ccceb88cb27571a6e88
|
| 3 |
+
size 1618968816
|
model_performance_2d.png
ADDED
|
Git LFS Details
|
plots/eval_loss.png
ADDED
|
Git LFS Details
|
plots/eval_masked_accuracy.png
ADDED
|
Git LFS Details
|
plots/grad_L2_norm.png
ADDED
|
Git LFS Details
|
plots/lr_schedule.png
ADDED
|
Git LFS Details
|
plots/train_loss.png
ADDED
|
Git LFS Details
|
plots/train_masked_accuracy.png
ADDED
|
Git LFS Details
|
pytorch_model.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:276f10308ec4c7fc5a54c1df036d0bf65f36bd20fab2d29d0c5f5412ccd2b717
|
| 3 |
+
size 1619005182
|
special_tokens_map.json
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"bos_token": {
|
| 3 |
+
"content": "<s>",
|
| 4 |
+
"lstrip": false,
|
| 5 |
+
"normalized": false,
|
| 6 |
+
"rstrip": false,
|
| 7 |
+
"single_word": false
|
| 8 |
+
},
|
| 9 |
+
"cls_token": {
|
| 10 |
+
"content": "<s>",
|
| 11 |
+
"lstrip": false,
|
| 12 |
+
"normalized": false,
|
| 13 |
+
"rstrip": false,
|
| 14 |
+
"single_word": false
|
| 15 |
+
},
|
| 16 |
+
"eos_token": {
|
| 17 |
+
"content": "</s>",
|
| 18 |
+
"lstrip": false,
|
| 19 |
+
"normalized": false,
|
| 20 |
+
"rstrip": false,
|
| 21 |
+
"single_word": false
|
| 22 |
+
},
|
| 23 |
+
"mask_token": {
|
| 24 |
+
"content": "<mask>",
|
| 25 |
+
"lstrip": true,
|
| 26 |
+
"normalized": false,
|
| 27 |
+
"rstrip": false,
|
| 28 |
+
"single_word": false
|
| 29 |
+
},
|
| 30 |
+
"pad_token": {
|
| 31 |
+
"content": "[PAD]",
|
| 32 |
+
"lstrip": false,
|
| 33 |
+
"normalized": false,
|
| 34 |
+
"rstrip": false,
|
| 35 |
+
"single_word": false
|
| 36 |
+
},
|
| 37 |
+
"sep_token": {
|
| 38 |
+
"content": "</s>",
|
| 39 |
+
"lstrip": false,
|
| 40 |
+
"normalized": false,
|
| 41 |
+
"rstrip": false,
|
| 42 |
+
"single_word": false
|
| 43 |
+
},
|
| 44 |
+
"unk_token": {
|
| 45 |
+
"content": "<unk>",
|
| 46 |
+
"lstrip": false,
|
| 47 |
+
"normalized": false,
|
| 48 |
+
"rstrip": false,
|
| 49 |
+
"single_word": false
|
| 50 |
+
}
|
| 51 |
+
}
|
tokenizer.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
tokenizer_config.json
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"added_tokens_decoder": {
|
| 3 |
+
"0": {
|
| 4 |
+
"content": "[PAD]",
|
| 5 |
+
"lstrip": false,
|
| 6 |
+
"normalized": false,
|
| 7 |
+
"rstrip": false,
|
| 8 |
+
"single_word": false,
|
| 9 |
+
"special": true
|
| 10 |
+
},
|
| 11 |
+
"1": {
|
| 12 |
+
"content": "<s>",
|
| 13 |
+
"lstrip": false,
|
| 14 |
+
"normalized": false,
|
| 15 |
+
"rstrip": false,
|
| 16 |
+
"single_word": false,
|
| 17 |
+
"special": true
|
| 18 |
+
},
|
| 19 |
+
"2": {
|
| 20 |
+
"content": "</s>",
|
| 21 |
+
"lstrip": false,
|
| 22 |
+
"normalized": false,
|
| 23 |
+
"rstrip": false,
|
| 24 |
+
"single_word": false,
|
| 25 |
+
"special": true
|
| 26 |
+
},
|
| 27 |
+
"3": {
|
| 28 |
+
"content": "<unk>",
|
| 29 |
+
"lstrip": false,
|
| 30 |
+
"normalized": false,
|
| 31 |
+
"rstrip": false,
|
| 32 |
+
"single_word": false,
|
| 33 |
+
"special": true
|
| 34 |
+
},
|
| 35 |
+
"4": {
|
| 36 |
+
"content": "<mask>",
|
| 37 |
+
"lstrip": true,
|
| 38 |
+
"normalized": false,
|
| 39 |
+
"rstrip": false,
|
| 40 |
+
"single_word": false,
|
| 41 |
+
"special": true
|
| 42 |
+
}
|
| 43 |
+
},
|
| 44 |
+
"bos_token": "<s>",
|
| 45 |
+
"clean_up_tokenization_spaces": false,
|
| 46 |
+
"cls_token": "<s>",
|
| 47 |
+
"eos_token": "</s>",
|
| 48 |
+
"mask_token": "<mask>",
|
| 49 |
+
"model_max_length": 2048,
|
| 50 |
+
"pad_token": "[PAD]",
|
| 51 |
+
"sep_token": "</s>",
|
| 52 |
+
"tokenizer_class": "PreTrainedTokenizerFast",
|
| 53 |
+
"unk_token": "<unk>",
|
| 54 |
+
"model_input_names": [
|
| 55 |
+
"input_ids",
|
| 56 |
+
"attention_mask"
|
| 57 |
+
]
|
| 58 |
+
}
|