Add RoFormer-slav
Browse files- README.md +70 -0
- aeneas_decode.py +234 -0
- config.json +28 -0
- configuration_roformer.py +18 -0
- model.safetensors +3 -0
- modeling_roformer.py +11 -0
- special_tokens_map.json +18 -0
- tokenizer.json +0 -0
- tokenizer_config.json +67 -0
README.md
CHANGED
|
@@ -1,3 +1,73 @@
|
|
| 1 |
---
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
license: apache-2.0
|
| 3 |
---
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
---
|
| 2 |
+
language:
|
| 3 |
+
- orv
|
| 4 |
+
- cu
|
| 5 |
+
tags:
|
| 6 |
+
- masked-language-modeling
|
| 7 |
+
- old-slavonic
|
| 8 |
+
- old-russian
|
| 9 |
+
- birchbark
|
| 10 |
+
- historical-nlp
|
| 11 |
+
- roformer
|
| 12 |
+
- rope
|
| 13 |
+
- bpe
|
| 14 |
license: apache-2.0
|
| 15 |
---
|
| 16 |
+
|
| 17 |
+
# RoFormer-slav
|
| 18 |
+
|
| 19 |
+
A masked language model trained from scratch on Old East Slavic and Old Church Slavonic texts,
|
| 20 |
+
using a RoFormer architecture with BPE tokenisation. Based on [mini-roformer-ancient-rus-v2](https://huggingface.co/AlexSychovUN/mini-roformer-ancient-rus-v2).
|
| 21 |
+
|
| 22 |
+
Note: BPE token boundaries do not always align with lacuna boundaries in editorial markup,
|
| 23 |
+
which inflates span-level CER. For character-level restoration tasks consider using
|
| 24 |
+
[DualEmbLM](https://huggingface.co/MaximEremeev/DualEmb-slav) instead.
|
| 25 |
+
|
| 26 |
+
## Architecture
|
| 27 |
+
|
| 28 |
+
- **Tokenisation**: BPE (Byte Pair Encoding), vocabulary size 50k
|
| 29 |
+
- **Architecture**: RoFormer encoder with Rotary Position Embeddings (RoPE)
|
| 30 |
+
- **Size**: 6 layers, hidden size 512, 8 attention heads
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
## Training
|
| 34 |
+
|
| 35 |
+
The model was trained on a corpus of Old Russian and Church Slavonic texts assembled from the following sources:
|
| 36 |
+
|
| 37 |
+
| Source | Language | Word Tokens | Link |
|
| 38 |
+
|--------|----------|--------|------|
|
| 39 |
+
| Birchbark manuscripts | Old Novgorodian (mostly) | 21,464 | [gramoty.ru](https://gramoty.ru) |
|
| 40 |
+
| Epigraphy | Old Church Slavonic (mostly) | 8,102 | [epigraphica.ru](https://epigraphica.ru) |
|
| 41 |
+
| DIACU | Old Church Slavonic; Church Slavonic (Old Russian, Middle Bulgarian, Serbian, Resava recensions); Middle Russian | 1,683,307 | [ACL Anthology](https://aclanthology.org/2025.bsnlp-1.12/) |
|
| 42 |
+
| TOROT | Old Russian; Church Slavonic | 682,430 | [torottreebank.github.io](https://torottreebank.github.io) |
|
| 43 |
+
| Bible (Ponomar) | Church Slavonic | 603,047 | [GitHub](https://github.com/typiconman/ponomar/tree/master/Ponomar/languages/cu/bible/elis) |
|
| 44 |
+
| Byliny | Old Russian (XI–XVII c.) | 430,103 | [rusneb.ru](https://rusneb.ru/catalog/000199_000009_003636356/) |
|
| 45 |
+
| Pushkin House | Old Russian | 256,503 | [lib2.pushkinskijdom.ru](https://lib2.pushkinskijdom.ru) |
|
| 46 |
+
| Military Statute (Part 2) | Old Russian | 49,787 | [rusneb.ru](https://rusneb.ru/catalog/000199_000009_004093983/) |
|
| 47 |
+
| NKRYA (historical) | Old Russian; Old Rus (XI–XVIII c.) | 42,412 | [ruscorpora.ru](https://ruscorpora.ru) |
|
| 48 |
+
|
| 49 |
+
Masking details: MLM probability 8%, span masking, edge masking, random gap augmentation.
|
| 50 |
+
|
| 51 |
+
## Usage
|
| 52 |
+
|
| 53 |
+
```python
|
| 54 |
+
from transformers import AutoTokenizer, AutoModelForMaskedLM
|
| 55 |
+
|
| 56 |
+
tokenizer = AutoTokenizer.from_pretrained(
|
| 57 |
+
"MaximEremeev/RoFormer-slav",
|
| 58 |
+
trust_remote_code=True,
|
| 59 |
+
)
|
| 60 |
+
model = AutoModelForMaskedLM.from_pretrained(
|
| 61 |
+
"MaximEremeev/RoFormer-slav",
|
| 62 |
+
trust_remote_code=True,
|
| 63 |
+
)
|
| 64 |
+
```
|
| 65 |
+
|
| 66 |
+
## Tasks
|
| 67 |
+
|
| 68 |
+
- **Generated lacunae restoration** (Test A Hit@1: 0.267, CER: 0.839)
|
| 69 |
+
- **Real lacunae restoration** (Test B char Hit@1: 0.158, span Hit@1: 0.063)
|
| 70 |
+
|
| 71 |
+
## Contact
|
| 72 |
+
|
| 73 |
+
Maxim Eremeev, maeremeev@edu.hse.ru
|
aeneas_decode.py
ADDED
|
@@ -0,0 +1,234 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
aeneas_decode.py
|
| 3 |
+
~~~~~~~~~~~~~~~~
|
| 4 |
+
Confidence-ordered beam search, following Assael et al. (2022) «Ithaca/Aeneas».
|
| 5 |
+
|
| 6 |
+
Algorithm per step:
|
| 7 |
+
1. Forward pass — получаем логиты для ВСЕХ оставшихся [MASK] позиций сразу.
|
| 8 |
+
2. Находим позицию с максимальной уверенностью модели
|
| 9 |
+
(argmax по max-probability среди всех масок в этом биме).
|
| 10 |
+
3. Расширяем только эту позицию: берём top-k токенов.
|
| 11 |
+
4. Обрезаем до top-k бимов по суммарному log-probability.
|
| 12 |
+
5. Повторяем пока не останется ни одной маски.
|
| 13 |
+
"""
|
| 14 |
+
|
| 15 |
+
from pathlib import Path
|
| 16 |
+
import math
|
| 17 |
+
from dataclasses import dataclass, field
|
| 18 |
+
from typing import List, Optional
|
| 19 |
+
|
| 20 |
+
import torch
|
| 21 |
+
from transformers import PreTrainedModel, PreTrainedTokenizerBase
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
@dataclass
class Beam:
    """One hypothesis tracked by the confidence-ordered beam search."""

    # Token ids of the hypothesis, shape [seq_len]; masks are replaced one by one.
    input_ids: torch.Tensor
    # Sum of the log-probabilities of all tokens filled in so far.
    log_prob: float = 0.0
    # (position, token_id) pairs in the order in which the masks were filled.
    filled: List[tuple] = field(default_factory=list)


def aeneas_beam_search(
    input_ids: torch.Tensor,  # [seq_len], already on the target device
    model: "PreTrainedModel",
    tokenizer: "PreTrainedTokenizerBase",
    *,
    beam_width: int = 5,
    temperature: float = 1.0,
    banned_token_ids: Optional[List[int]] = None,  # e.g. [GAP_ID]
) -> List[Beam]:
    """Fill every mask token in ``input_ids`` with a confidence-ordered beam search.

    At each step, and for each live beam, the model scores all remaining mask
    positions in one forward pass. Only the position whose best token has the
    highest probability is expanded (with its top ``beam_width`` tokens), and
    the pool of hypotheses is pruned back to ``beam_width`` by total
    log-probability.

    Args:
        input_ids: 1-D tensor of token ids containing zero or more mask tokens.
        model: callable returning an object with a ``logits`` attribute of
            shape [1, seq_len, vocab] for a [1, seq_len] input.
        tokenizer: only ``tokenizer.mask_token_id`` is read.
        beam_width: number of hypotheses kept after each step (>= 1).
        temperature: logits are divided by ``max(temperature, 1e-6)``.
        banned_token_ids: token ids that must never be predicted; ids outside
            ``[0, vocab)`` are ignored.

    Returns:
        At most ``beam_width`` beams sorted by descending ``log_prob``. When
        the input has no mask token, a single beam holding a copy of the input.

    Raises:
        ValueError: if ``beam_width`` is < 1 or the tokenizer has no mask token.
    """
    if beam_width < 1:
        raise ValueError("beam_width must be a positive integer")
    mask_id = tokenizer.mask_token_id
    if mask_id is None:
        raise ValueError("tokenizer does not define a mask token")

    banned = sorted(set(banned_token_ids or []))
    scale = max(temperature, 1e-6)  # guards against division by zero

    # Start from a single hypothesis that is a copy of the input.
    beams: List[Beam] = [Beam(input_ids=input_ids.clone())]

    # One mask is filled per step, so this is the number of steps to run.
    n_masks = int((input_ids == mask_id).sum().item())

    with torch.no_grad():
        for _ in range(n_masks):
            candidates: List[Beam] = []

            for beam in beams:
                mask_positions = (beam.input_ids == mask_id).nonzero(
                    as_tuple=True)[0]

                if mask_positions.numel() == 0:
                    # Already complete: carry the beam over unchanged.
                    candidates.append(beam)
                    continue

                # ── Forward pass ──────────────────────────────────────────
                logits = model(
                    beam.input_ids.unsqueeze(0)
                ).logits[0]  # [seq_len, vocab]

                # Score all remaining masks at once: [n_remaining, vocab].
                masked_logits = logits[mask_positions] / scale
                vocab_size = masked_logits.shape[-1]
                valid_banned = [t for t in banned if 0 <= t < vocab_size]
                if valid_banned:
                    masked_logits[:, valid_banned] = float("-inf")
                log_probs = masked_logits.log_softmax(dim=-1)

                # ── Pick the most confident position ──────────────────────
                # Confidence of a position = log-prob of its best token;
                # argmax returns the first maximum, i.e. the leftmost on ties.
                row = int(log_probs.max(dim=-1).values.argmax().item())
                best_pos = int(mask_positions[row].item())

                # ── Expand only that position ─────────────────────────────
                # topk raises if k exceeds the vocabulary size, so clamp it.
                k = min(beam_width, vocab_size)
                top_log_probs, top_ids = log_probs[row].topk(k)

                for token_lp, token_id in zip(top_log_probs.tolist(),
                                              top_ids.tolist()):
                    if not math.isfinite(token_lp):
                        # Banned (or otherwise impossible) token.
                        continue
                    new_ids = beam.input_ids.clone()
                    new_ids[best_pos] = token_id
                    candidates.append(Beam(
                        input_ids=new_ids,
                        log_prob=beam.log_prob + token_lp,
                        filled=beam.filled + [(best_pos, token_id)],
                    ))

            # ── Pruning: keep the top ``beam_width`` hypotheses ───────────
            beams = sorted(candidates, key=lambda b: b.log_prob, reverse=True)
            beams = beams[:beam_width]

    return beams
|
| 115 |
+
|
| 116 |
+
|
| 117 |
+
# ── Helper: decoding the results ───────────────────────────────────────────────
|
| 118 |
+
|
| 119 |
+
def decode_beams(
    beams: List[Beam],
    original_ids: torch.Tensor,
    tokenizer: PreTrainedTokenizerBase,
) -> List[dict]:
    """Turn finished beams into a list of plain, printable dictionaries.

    Args:
        beams: hypotheses to report, in the order they should be returned.
        original_ids: accepted for interface compatibility; not read here.
        tokenizer: used to decode token ids back into text.

    Returns:
        One dict per beam, in input order, with the keys:
            "text":          the fully restored text,
            "filled_tokens": [(position, token_str), ...] in fill order,
            "score":         softmax of this beam's log-prob over all given
                             beams (0..1), rounded to 4 decimals,
            "log_prob":      the beam's total log-prob, rounded to 4 decimals.
    """
    # Relative weight of each beam: softmax over the beams' total log-probs.
    beam_log_probs = torch.tensor(
        [hyp.log_prob for hyp in beams], dtype=torch.float
    )
    weights = beam_log_probs.softmax(dim=0).tolist()

    decoded = []
    for hyp, weight in zip(beams, weights):
        restored = tokenizer.decode(hyp.input_ids, skip_special_tokens=True)

        pieces = []
        for position, token_id in hyp.filled:
            piece = tokenizer.decode(
                [token_id],
                skip_special_tokens=True,
                clean_up_tokenization_spaces=False,
            )
            pieces.append((position, piece.strip()))

        decoded.append({
            "text": restored,
            "filled_tokens": pieces,
            "score": round(weight, 4),
            "log_prob": round(hyp.log_prob, 4),
        })

    return decoded
|
| 160 |
+
|
| 161 |
+
|
| 162 |
+
# ── Высокоуровневый интерфейс ───────────────────────────────────────────────────
|
| 163 |
+
|
| 164 |
+
def restore(
    text: str,
    model: PreTrainedModel,
    tokenizer: PreTrainedTokenizerBase,
    *,
    beam_width: int = 5,
    temperature: float = 1.0,
    gap_token: str = "[GAP]",
    max_length: int = 512,
) -> List[dict]:
    """High-level wrapper: take a string with mask tokens, return decoded beams.

    Masks may be written either as the tokenizer's own mask token or as the
    generic ``[MASK]`` placeholder. The placeholder is rewritten to
    ``tokenizer.mask_token`` when the two differ and ``[MASK]`` is not itself
    a vocabulary entry; without this, a tokenizer whose mask token is e.g.
    ``<mask>`` would see no masks at all and nothing would be restored.

    Args:
        text: text containing one or more mask tokens.
        model: masked-LM passed through to ``aeneas_beam_search``; its first
            parameter's device decides where the input ids are placed.
        tokenizer: tokenizer matching ``model``.
        beam_width: number of beams.
        temperature: values < 1 sharpen the distribution, values > 1 soften it.
        gap_token: gap marker token; excluded from predictions when it is in
            the vocabulary.
        max_length: inputs are truncated to this many tokens.

    Returns:
        The list of dictionaries produced by ``decode_beams``.

    Raises:
        ValueError: if the tokenizer does not define a mask token.
    """
    mask_token = tokenizer.mask_token
    if mask_token is None:
        raise ValueError("tokenizer does not define a mask token")

    vocab = tokenizer.get_vocab()

    # Map the generic placeholder onto the tokenizer's real mask token.
    generic_mask = "[MASK]"
    if mask_token != generic_mask and generic_mask not in vocab:
        text = text.replace(generic_mask, mask_token)

    device = next(model.parameters()).device

    enc = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=max_length,
    )
    input_ids = enc["input_ids"][0].to(device)

    # Exclude the gap token from the predictions.
    banned = []
    if gap_token in vocab:
        banned.append(tokenizer.convert_tokens_to_ids(gap_token))

    beams = aeneas_beam_search(
        input_ids, model, tokenizer,
        beam_width=beam_width,
        temperature=temperature,
        banned_token_ids=banned,
    )

    return decode_beams(beams, input_ids, tokenizer)
|
| 206 |
+
|
| 207 |
+
|
| 208 |
+
# ── CLI / быстрая проверка ─────────────────────────────────────────────────────
|
| 209 |
+
|
| 210 |
+
if __name__ == "__main__":
    import argparse
    from transformers import AutoModelForMaskedLM, AutoTokenizer

    _HERE = Path(__file__).resolve().parent
    # Prefer the training-time output directory when it exists; otherwise use
    # the directory holding this script, so the script also works when it sits
    # next to the model files.
    _default_model = _HERE / "outputs/final_model"
    if not _default_model.is_dir():
        _default_model = _HERE

    parser = argparse.ArgumentParser()
    parser.add_argument("--model", default=str(_default_model))
    parser.add_argument("--text", default="поклоне ѿ [MASK] к ѥва про [MASK] ѡкупи")
    parser.add_argument("--top_k", type=int, default=5)
    parser.add_argument("--temp", type=float, default=1.0)
    args = parser.parse_args()

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    tokenizer = AutoTokenizer.from_pretrained(args.model)
    model = AutoModelForMaskedLM.from_pretrained(args.model).to(device)
    model.eval()  # inference only: disable dropout

    print(f"\nВход: {args.text}\n")
    results = restore(args.text, model, tokenizer,
                      beam_width=args.top_k, temperature=args.temp)

    for i, r in enumerate(results, 1):
        print(f" [{i}] score={r['score']:.3f} log_prob={r['log_prob']:.3f}")
        print(f" {r['text']}")
        print(f" заполнено: {r['filled_tokens']}")
|
config.json
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"architectures": [
|
| 3 |
+
"RoFormerForMaskedLM"
|
| 4 |
+
],
|
| 5 |
+
"attention_probs_dropout_prob": 0.1,
|
| 6 |
+
"embedding_size": 512,
|
| 7 |
+
"hidden_act": "gelu",
|
| 8 |
+
"hidden_dropout_prob": 0.1,
|
| 9 |
+
"hidden_size": 512,
|
| 10 |
+
"initializer_range": 0.02,
|
| 11 |
+
"intermediate_size": 2048,
|
| 12 |
+
"layer_norm_eps": 1e-12,
|
| 13 |
+
"max_position_embeddings": 514,
|
| 14 |
+
"model_type": "roformer",
|
| 15 |
+
"num_attention_heads": 8,
|
| 16 |
+
"num_hidden_layers": 6,
|
| 17 |
+
"pad_token_id": 1,
|
| 18 |
+
"rotary_value": false,
|
| 19 |
+
"torch_dtype": "float32",
|
| 20 |
+
"transformers_version": "4.48.0",
|
| 21 |
+
"type_vocab_size": 2,
|
| 22 |
+
"use_cache": true,
|
| 23 |
+
"vocab_size": 50000,
|
| 24 |
+
"auto_map": {
|
| 25 |
+
"AutoConfig": "configuration_roformer.RoFormerConfig",
|
| 26 |
+
"AutoModelForMaskedLM": "modeling_roformer.RoFormerForMaskedLM"
|
| 27 |
+
}
|
| 28 |
+
}
|
configuration_roformer.py
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from transformers import RoFormerConfig
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
def get_roformer_config(vocab_size: int, pad_token_id: int):
    """Build the ``RoFormerConfig`` with this model's fixed hyper-parameters.

    Only the vocabulary size and the padding token id vary; the architecture
    is fixed at 6 layers, 8 attention heads, embedding and hidden size 512,
    feed-forward size 2048, 514 positions, and ``rotary_value=False``.

    Args:
        vocab_size: number of entries in the tokenizer vocabulary.
        pad_token_id: id of the padding token.

    Returns:
        A ``RoFormerConfig`` instance.
    """
    # Fixed architecture settings, kept separate from the per-tokenizer values.
    architecture = {
        "embedding_size": 512,
        "hidden_size": 512,
        "num_hidden_layers": 6,
        "num_attention_heads": 8,
        "intermediate_size": 2048,
        "max_position_embeddings": 514,
        "rotary_value": False,
    }
    return RoFormerConfig(
        vocab_size=vocab_size,
        pad_token_id=pad_token_id,
        **architecture,
    )
|
model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:52b96dbc3614a8a7964630f05c314e57336c043e79ebff6bc815353a0c0ac270
|
| 3 |
+
size 179464480
|
modeling_roformer.py
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from transformers import RoFormerForMaskedLM

# The config helper lives in configuration_roformer.py in this repository;
# there is no `config` module. A relative import is the form that Hub
# remote-code loading (trust_remote_code=True) resolves to a sibling file.
from .configuration_roformer import get_roformer_config


def get_model(vocab_size: int, pad_token_id: int):
    """Create a freshly initialised RoFormer model for masked language modelling.

    Args:
        vocab_size: number of entries in the tokenizer vocabulary.
        pad_token_id: id of the padding token.

    Returns:
        A ``RoFormerForMaskedLM`` built from ``get_roformer_config``; no
        pretrained weights are loaded here.
    """
    config = get_roformer_config(vocab_size, pad_token_id)
    return RoFormerForMaskedLM(config)
|
special_tokens_map.json
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"additional_special_tokens": [
|
| 3 |
+
{
|
| 4 |
+
"content": "[GAP]",
|
| 5 |
+
"lstrip": false,
|
| 6 |
+
"normalized": false,
|
| 7 |
+
"rstrip": false,
|
| 8 |
+
"single_word": false
|
| 9 |
+
}
|
| 10 |
+
],
|
| 11 |
+
"bos_token": "<s>",
|
| 12 |
+
"cls_token": "<s>",
|
| 13 |
+
"eos_token": "</s>",
|
| 14 |
+
"mask_token": "<mask>",
|
| 15 |
+
"pad_token": "<pad>",
|
| 16 |
+
"sep_token": "</s>",
|
| 17 |
+
"unk_token": "<unk>"
|
| 18 |
+
}
|
tokenizer.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
tokenizer_config.json
ADDED
|
@@ -0,0 +1,67 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"added_tokens_decoder": {
|
| 3 |
+
"0": {
|
| 4 |
+
"content": "<s>",
|
| 5 |
+
"lstrip": false,
|
| 6 |
+
"normalized": false,
|
| 7 |
+
"rstrip": false,
|
| 8 |
+
"single_word": false,
|
| 9 |
+
"special": true
|
| 10 |
+
},
|
| 11 |
+
"1": {
|
| 12 |
+
"content": "<pad>",
|
| 13 |
+
"lstrip": false,
|
| 14 |
+
"normalized": false,
|
| 15 |
+
"rstrip": false,
|
| 16 |
+
"single_word": false,
|
| 17 |
+
"special": true
|
| 18 |
+
},
|
| 19 |
+
"2": {
|
| 20 |
+
"content": "</s>",
|
| 21 |
+
"lstrip": false,
|
| 22 |
+
"normalized": false,
|
| 23 |
+
"rstrip": false,
|
| 24 |
+
"single_word": false,
|
| 25 |
+
"special": true
|
| 26 |
+
},
|
| 27 |
+
"3": {
|
| 28 |
+
"content": "<unk>",
|
| 29 |
+
"lstrip": false,
|
| 30 |
+
"normalized": false,
|
| 31 |
+
"rstrip": false,
|
| 32 |
+
"single_word": false,
|
| 33 |
+
"special": true
|
| 34 |
+
},
|
| 35 |
+
"4": {
|
| 36 |
+
"content": "<mask>",
|
| 37 |
+
"lstrip": false,
|
| 38 |
+
"normalized": false,
|
| 39 |
+
"rstrip": false,
|
| 40 |
+
"single_word": false,
|
| 41 |
+
"special": true
|
| 42 |
+
},
|
| 43 |
+
"5": {
|
| 44 |
+
"content": "[GAP]",
|
| 45 |
+
"lstrip": false,
|
| 46 |
+
"normalized": false,
|
| 47 |
+
"rstrip": false,
|
| 48 |
+
"single_word": false,
|
| 49 |
+
"special": true
|
| 50 |
+
}
|
| 51 |
+
},
|
| 52 |
+
"additional_special_tokens": [
|
| 53 |
+
"[GAP]"
|
| 54 |
+
],
|
| 55 |
+
"bos_token": "<s>",
|
| 56 |
+
"clean_up_tokenization_spaces": false,
|
| 57 |
+
"cls_token": "<s>",
|
| 58 |
+
"eos_token": "</s>",
|
| 59 |
+
"extra_special_tokens": {},
|
| 60 |
+
"mask_token": "<mask>",
|
| 61 |
+
"max_len": 512,
|
| 62 |
+
"model_max_length": 512,
|
| 63 |
+
"pad_token": "<pad>",
|
| 64 |
+
"sep_token": "</s>",
|
| 65 |
+
"tokenizer_class": "PreTrainedTokenizerFast",
|
| 66 |
+
"unk_token": "<unk>"
|
| 67 |
+
}
|