Add DualEmbLM

Browse files

Files changed (9) hide show

README.md +54 -0
align_dual.py +87 -0
char_vocab.json +158 -0
config.json +31 -0
configuration_dual.py +38 -0
embeddings.py +39 -0
model.safetensors +3 -0
modeling_dual.py +83 -0
word_vocab.json +0 -0

README.md CHANGED Viewed

@@ -1,3 +1,57 @@
 ---
 license: apache-2.0
 ---

 ---
+language: orv
+tags:
+  - masked-language-modeling
+  - old-slavonic
+  - old-russian
+  - birchbark
+  - historical-nlp
+  - dual-embeddings
 license: apache-2.0
 ---
+# DualEmbLM
+A masked language model trained from scratch on Old East Slavic and Old Church Slavonic texts,
+with dual character-level + word-level embeddings.
+## Architecture
+DualEmbLM combines:
+- **Character-level tokenisation** (1 character = 1 token) — enables precise lacuna restoration at the character level
+- **Word-level context embeddings** — provides morphological and lexical context via a 50k word vocabulary
+- **Transformer encoder** (BERT architecture, trained from scratch) — 6 layers, hidden size 512, 8 attention heads
+The dual embeddings are concatenated and projected into the shared
+hidden space before being passed to the transformer encoder.
+## Training
+Trained on a corpus of Old East Slavic and Old Church Slavonic texts (MLM probability 8%, span masking, edge masking, random gap augmentation).
+## Usage
+```python
+from transformers import AutoModelForMaskedLM
+model = AutoModelForMaskedLM.from_pretrained(
+    "your-username/novgorodets",
+    trust_remote_code=True,
+)
+```
+## Tasks
+- **Lacuna restoration** (Test A Hit@1: 0.817, CER: 0.183)
+- **Real gap restoration** (Test B char Hit@1: 0.466, span Hit@1: 0.222)
+## Citation
+If you use this model, please cite:
+```
+@mastersthesis{...,
+  title  = {Automatic Restoration and Analysis of Birchbark Manuscripts},
+  author = {Maxim Eremeev},
+  year   = {2026},
+}
+```

align_dual.py ADDED Viewed

	@@ -0,0 +1,87 @@

+import json
+import re
+from pathlib import Path
# Pattern for pieces that are kept as single units when splitting text.
# The whole alternation sits in one capturing group, so ``SPECIAL_RE.split``
# returns the matched pieces alongside the text between them.
SPECIAL_RE = re.compile(
    r"(\[CTX_[A-Z_]+\]"                   # bracketed [CTX_<UPPERCASE/underscore>] markers
    r"|\[GAP\]|\[MASK\]"                  # [GAP] and [MASK]
    r"|\[PAD\]|\[UNK\]|\[CLS\]|\[SEP\]"   # remaining bracketed tokens
    r"|[+:·])"                            # a single plus, colon or middle dot
)
+def load_vocab(path: str | Path) -> dict[str, int]:
+    return json.loads(Path(path).read_text(encoding="utf-8"))
def split_special(text: str) -> list[str]:
    """Split ``text`` on ``SPECIAL_RE``, keeping the matched pieces.

    Args:
        text: String to split.

    Returns:
        The non-empty pieces in their original order; pieces matched by
        ``SPECIAL_RE`` appear as separate items.
    """
    # re.split yields empty strings around adjacent matches; filter drops them.
    return list(filter(None, SPECIAL_RE.split(text)))
def align_char_to_word(
    text: str,
    char_vocab: dict[str, int],
    word_vocab: dict[str, int],
    max_len: int = 256,
    add_cls_sep: bool = True,
) -> dict[str, list[int]]:
    """Encode ``text`` as parallel character-id and word-id sequences.

    The stripped text is split with ``split_special``. A piece that fully
    matches ``SPECIAL_RE`` occupies one position. Any other piece is split on
    whitespace runs and expanded to one position per character; all characters
    of a non-whitespace chunk share that chunk's word id, and whitespace
    characters are paired with the unknown-word id.

    Args:
        text: Input string.
        char_vocab: Token/character -> id mapping. Must contain ``"[UNK]"``,
            ``"[PAD]"``, ``"[CLS]"`` and ``"[SEP]"``.
        word_vocab: Word -> id mapping. Must contain ``"[UNK_WORD]"`` and
            ``"[PAD_WORD]"``; ``"[CLS]"`` / ``"[SEP]"`` are optional and fall
            back to the unknown-word id.
        max_len: Longer sequences are cut to this length, shorter ones are
            right-padded up to it.
        add_cls_sep: Whether to wrap the sequence in ``[CLS]`` ... ``[SEP]``.

    Returns:
        A dict with four equally long lists: ``"input_ids"``, ``"word_ids"``,
        ``"attention_mask"`` (1 for real positions, 0 for padding) and
        ``"special_tokens_mask"`` (1 for padding and for ids whose
        ``char_vocab`` key starts with ``"["`` and ends with ``"]"``).

    Raises:
        KeyError: If one of the required vocabulary entries is missing.
    """
    char_unk = char_vocab["[UNK]"]
    char_pad = char_vocab["[PAD]"]
    char_cls = char_vocab["[CLS]"]
    char_sep = char_vocab["[SEP]"]
    word_unk = word_vocab["[UNK_WORD]"]
    word_pad = word_vocab["[PAD_WORD]"]
    word_cls = word_vocab.get("[CLS]", word_unk)
    word_sep = word_vocab.get("[SEP]", word_unk)

    # Ids of bracketed tokens such as "[PAD]"; the one-character keys "[" and
    # "]" do not satisfy both conditions and are therefore not included.
    special_char_ids = {
        idx
        for token, idx in char_vocab.items()
        if token.startswith("[") and token.endswith("]")
    }

    input_ids: list[int] = []
    word_ids: list[int] = []

    if add_cls_sep:
        input_ids.append(char_cls)
        word_ids.append(word_cls)

    for part in split_special(text.strip()):
        if SPECIAL_RE.fullmatch(part):
            # A special piece takes exactly one position in both streams.
            input_ids.append(char_vocab.get(part, char_unk))
            word_ids.append(word_vocab.get(part, word_unk))
            continue
        # The capturing group keeps whitespace runs so that every character
        # of the original piece still gets a position.
        for chunk in re.split(r"(\s+)", part):
            if not chunk:
                continue
            wid = word_unk if chunk.isspace() else word_vocab.get(chunk, word_unk)
            input_ids.extend(char_vocab.get(ch, char_unk) for ch in chunk)
            word_ids.extend([wid] * len(chunk))

    if add_cls_sep:
        input_ids.append(char_sep)
        word_ids.append(word_sep)

    if len(input_ids) > max_len:
        input_ids = input_ids[:max_len]
        word_ids = word_ids[:max_len]
        # Re-terminate the truncated sequence with [SEP]. The emptiness check
        # avoids an IndexError when max_len is 0.
        if add_cls_sep and input_ids:
            input_ids[-1] = char_sep
            word_ids[-1] = word_sep

    attention_mask = [1] * len(input_ids)
    special_tokens_mask = [1 if tid in special_char_ids else 0 for tid in input_ids]

    pad_len = max_len - len(input_ids)
    if pad_len > 0:
        input_ids.extend([char_pad] * pad_len)
        word_ids.extend([word_pad] * pad_len)
        attention_mask.extend([0] * pad_len)
        special_tokens_mask.extend([1] * pad_len)

    return {
        "input_ids": input_ids,
        "word_ids": word_ids,
        "attention_mask": attention_mask,
        "special_tokens_mask": special_tokens_mask,
    }

char_vocab.json ADDED Viewed

	@@ -0,0 +1,158 @@

+{
+  "[PAD]": 0,
+  "[UNK]": 1,
+  "[CLS]": 2,
+  "[SEP]": 3,
+  "[MASK]": 4,
+  "[GAP]": 5,
+  " ": 6,
+  "0": 7,
+  "1": 8,
+  "2": 9,
+  "3": 10,
+  "4": 11,
+  "5": 12,
+  "6": 13,
+  "7": 14,
+  "8": 15,
+  "9": 16,
+  "[": 17,
+  "]": 18,
+  "d": 19,
+  "f": 20,
+  "g": 21,
+  "h": 22,
+  "j": 23,
+  "l": 24,
+  "q": 25,
+  "r": 26,
+  "u": 27,
+  "v": 28,
+  "z": 29,
+  "º": 30,
+  "á": 31,
+  "â": 32,
+  "é": 33,
+  "í": 34,
+  "î": 35,
+  "ï": 36,
+  "ó": 37,
+  "ý": 38,
+  "ă": 39,
+  "ı": 40,
+  "ł": 41,
+  "ŕ": 42,
+  "ş": 43,
+  "ţ": 44,
+  "έ": 45,
+  "ή": 46,
+  "ί": 47,
+  "α": 48,
+  "β": 49,
+  "γ": 50,
+  "δ": 51,
+  "ε": 52,
+  "ζ": 53,
+  "η": 54,
+  "θ": 55,
+  "ι": 56,
+  "κ": 57,
+  "λ": 58,
+  "μ": 59,
+  "ξ": 60,
+  "ο": 61,
+  "π": 62,
+  "ρ": 63,
+  "ς": 64,
+  "σ": 65,
+  "τ": 66,
+  "υ": 67,
+  "φ": 68,
+  "χ": 69,
+  "ψ": 70,
+  "ϊ": 71,
+  "ό": 72,
+  "ώ": 73,
+  "Е": 74,
+  "М": 75,
+  "О": 76,
+  "П": 77,
+  "Р": 78,
+  "С": 79,
+  "а": 80,
+  "б": 81,
+  "в": 82,
+  "г": 83,
+  "д": 84,
+  "е": 85,
+  "ж": 86,
+  "з": 87,
+  "и": 88,
+  "й": 89,
+  "к": 90,
+  "л": 91,
+  "м": 92,
+  "н": 93,
+  "о": 94,
+  "п": 95,
+  "р": 96,
+  "с": 97,
+  "т": 98,
+  "у": 99,
+  "ф": 100,
+  "х": 101,
+  "ц": 102,
+  "ч": 103,
+  "ш": 104,
+  "щ": 105,
+  "ъ": 106,
+  "ы": 107,
+  "ь": 108,
+  "э": 109,
+  "ю": 110,
+  "я": 111,
+  "ѐ": 112,
+  "ё": 113,
+  "ђ": 114,
+  "ѓ": 115,
+  "є": 116,
+  "ѕ": 117,
+  "і": 118,
+  "ї": 119,
+  "ћ": 120,
+  "ќ": 121,
+  "ѝ": 122,
+  "ў": 123,
+  "џ": 124,
+  "ѡ": 125,
+  "ѣ": 126,
+  "ѥ": 127,
+  "ѧ": 128,
+  "ѩ": 129,
+  "ѫ": 130,
+  "ѭ": 131,
+  "ѯ": 132,
+  "ѱ": 133,
+  "ѳ": 134,
+  "ѵ": 135,
+  "ѹ": 136,
+  "ѿ": 137,
+  "҃": 138,
+  "ґ": 139,
+  "ӏ": 140,
+  "ӣ": 141,
+  "ӳ": 142,
+  "ἀ": 143,
+  "ὰ": 144,
+  "ὲ": 145,
+  "ὴ": 146,
+  "ὶ": 147,
+  "ὸ": 148,
+  "ὺ": 149,
+  "ꙁ": 150,
+  "ꙃ": 151,
+  "ꙋ": 152,
+  "ꙑ": 153,
+  "ꙗ": 154,
+  "ꙩ": 155
+}

config.json ADDED Viewed

	@@ -0,0 +1,31 @@

+{
+  "architectures": [
+    "DualBertForMaskedLM"
+  ],
+  "attention_probs_dropout_prob": 0.1,
+  "classifier_dropout": null,
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 512,
+  "initializer_range": 0.02,
+  "intermediate_size": 2048,
+  "layer_norm_eps": 1e-12,
+  "max_position_embeddings": 256,
+  "model_type": "dual_bert",
+  "num_attention_heads": 8,
+  "num_hidden_layers": 6,
+  "pad_token_id": 0,
+  "position_embedding_type": "absolute",
+  "torch_dtype": "float32",
+  "transformers_version": "4.48.0",
+  "type_vocab_size": 2,
+  "use_cache": true,
+  "vocab_char_size": 156,
+  "vocab_size": 156,
+  "vocab_word_size": 50000,
+  "word_char_emb_dim": 192,
+  "auto_map": {
+    "AutoConfig": "configuration_dual.DualBertConfig",
+    "AutoModelForMaskedLM": "modeling_dual.DualBertForMaskedLM"
+  }
+}

configuration_dual.py ADDED Viewed

	@@ -0,0 +1,38 @@

+from transformers import BertConfig
class DualBertConfig(BertConfig):
    """``BertConfig`` extended with the fields of the dual-embedding model.

    The inherited ``vocab_size`` is always set to ``vocab_char_size``; a
    ``vocab_size`` supplied through ``kwargs`` is discarded.

    Args:
        vocab_char_size: Character vocabulary size; also forwarded to
            ``BertConfig`` as ``vocab_size``.
        vocab_word_size: Word vocabulary size.
        word_char_emb_dim: Stored on the config as ``word_char_emb_dim``.
        hidden_size: Forwarded to ``BertConfig``.
        num_hidden_layers: Forwarded to ``BertConfig``.
        num_attention_heads: Forwarded to ``BertConfig``.
        intermediate_size: Forwarded to ``BertConfig``.
        max_position_embeddings: Forwarded to ``BertConfig``.
        hidden_dropout_prob: Forwarded to ``BertConfig``.
        attention_probs_dropout_prob: Forwarded to ``BertConfig``.
        **kwargs: Any remaining ``BertConfig`` options.
    """

    model_type = "dual_bert"

    def __init__(
        self,
        vocab_char_size: int = 256,
        vocab_word_size: int = 50000,
        word_char_emb_dim: int = 192,
        hidden_size: int = 512,
        num_hidden_layers: int = 6,
        num_attention_heads: int = 8,
        intermediate_size: int = 2048,
        max_position_embeddings: int = 512,
        hidden_dropout_prob: float = 0.1,
        attention_probs_dropout_prob: float = 0.1,
        **kwargs,
    ):
        # A config.json round-trip hands ``vocab_size`` back through kwargs;
        # drop it so it cannot clash with the explicit value passed below.
        kwargs.pop("vocab_size", None)

        bert_args = dict(
            vocab_size=vocab_char_size,
            hidden_size=hidden_size,
            num_hidden_layers=num_hidden_layers,
            num_attention_heads=num_attention_heads,
            intermediate_size=intermediate_size,
            max_position_embeddings=max_position_embeddings,
            hidden_dropout_prob=hidden_dropout_prob,
            attention_probs_dropout_prob=attention_probs_dropout_prob,
        )
        super().__init__(**bert_args, **kwargs)

        # Fields specific to the dual-embedding model.
        self.vocab_char_size = vocab_char_size
        self.vocab_word_size = vocab_word_size
        self.word_char_emb_dim = word_char_emb_dim

embeddings.py ADDED Viewed

	@@ -0,0 +1,39 @@

+import torch
+import torch.nn as nn
class DualEmbeddings(nn.Module):
    """Fuse character and word embeddings into one hidden representation.

    Character and word ids are embedded separately (both with width
    ``config.word_char_emb_dim``), concatenated on the last axis, projected to
    ``config.hidden_size`` without bias, summed with learned position
    embeddings, then layer-normalised and passed through dropout.
    """

    def __init__(self, config):
        super().__init__()
        emb_dim = config.word_char_emb_dim

        self.char_embeddings = nn.Embedding(
            config.vocab_char_size, emb_dim, padding_idx=config.pad_token_id
        )
        # The word table uses index 0 as its padding row.
        self.word_embeddings = nn.Embedding(config.vocab_word_size, emb_dim, padding_idx=0)
        self.projection = nn.Linear(2 * emb_dim, config.hidden_size, bias=False)
        self.position_embeddings = nn.Embedding(
            config.max_position_embeddings, config.hidden_size
        )
        self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

        # Row vector 0..max_position_embeddings-1, kept out of the state dict.
        all_positions = torch.arange(config.max_position_embeddings).view(1, -1)
        self.register_buffer("position_ids", all_positions, persistent=False)

    def forward(self, input_ids, word_ids):
        """Embed a batch of aligned character and word ids.

        Args:
            input_ids: 2-D tensor of character ids (its shape is unpacked as
                two dimensions, the second being the sequence length).
            word_ids: Tensor of word ids; presumably the same shape as
                ``input_ids`` since the two embeddings are concatenated on the
                last axis - TODO confirm against callers.

        Returns:
            Tensor whose last dimension is ``config.hidden_size``.
        """
        _, seq_len = input_ids.shape
        fused = torch.cat(
            (self.char_embeddings(input_ids), self.word_embeddings(word_ids)),
            dim=-1,
        )
        positions = self.position_embeddings(self.position_ids[:, :seq_len])
        hidden = self.projection(fused) + positions
        return self.dropout(self.layer_norm(hidden))

model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e44e2787dc9518bcf8a3c1efcd7a8cad8639cd0a94ca89ec93ba80382b00ec07
+size 115899720

modeling_dual.py ADDED Viewed

	@@ -0,0 +1,83 @@

+import torch
+import torch.nn as nn
+from transformers import BertPreTrainedModel
+from transformers.modeling_outputs import MaskedLMOutput
+from transformers.models.bert.modeling_bert import BertEncoder
+from config import DualBertConfig
+from embeddings import DualEmbeddings
class DualBertForMaskedLM(BertPreTrainedModel):
    """Character-level masked LM built on ``DualEmbeddings`` and a ``BertEncoder``.

    The prediction head maps encoder states to ``config.word_char_emb_dim``
    (dense -> GELU -> LayerNorm) and scores them against the character
    embedding matrix, so output and input character embeddings share weights.
    """

    # NOTE(review): this file imports DualBertConfig with `from config import ...`,
    # while the commit ships the class in configuration_dual.py (and config.json's
    # auto_map points there too) - confirm the import path before loading.
    config_class = DualBertConfig

    def __init__(self, config: DualBertConfig):
        super().__init__(config)
        self.dual_embeddings = DualEmbeddings(config)
        self.encoder = BertEncoder(config)

        # MLM head: project back to the character-embedding width so the
        # result can be multiplied with the tied character embedding matrix.
        self.mlm_dense = nn.Linear(config.hidden_size, config.word_char_emb_dim)
        self.mlm_act = nn.GELU()
        self.mlm_norm = nn.LayerNorm(config.word_char_emb_dim, eps=config.layer_norm_eps)
        self.mlm_bias = nn.Parameter(torch.zeros(config.vocab_char_size))

        self.post_init()

    def get_input_embeddings(self):
        """Return the character embedding table."""
        return self.dual_embeddings.char_embeddings

    def set_input_embeddings(self, value):
        """Replace the character embedding table with ``value``."""
        self.dual_embeddings.char_embeddings = value

    def forward(
        self,
        input_ids=None,
        word_ids=None,
        attention_mask=None,
        labels=None,
        return_dict=True,
        **kwargs,
    ):
        """Compute character logits and, if ``labels`` is given, the MLM loss.

        Args:
            input_ids: Character ids; required.
            word_ids: Word ids aligned with ``input_ids``; required.
            attention_mask: Optional mask; defaults to all ones with the shape
                of ``input_ids``.
            labels: Optional target character ids; positions equal to ``-100``
                are ignored by the loss.
            return_dict: If false, return a tuple instead of ``MaskedLMOutput``.
            **kwargs: Accepted and ignored.

        Returns:
            ``MaskedLMOutput(loss, logits)``, or ``(loss, logits)`` /
            ``(logits,)`` when ``return_dict`` is false.

        Raises:
            ValueError: If ``input_ids`` or ``word_ids`` is missing.
            RuntimeError: If the logits contain NaN or infinite values.
        """
        if input_ids is None or word_ids is None:
            raise ValueError("Both input_ids and word_ids are required.")
        if attention_mask is None:
            attention_mask = torch.ones_like(input_ids, dtype=torch.long)

        emb = self.dual_embeddings(input_ids=input_ids, word_ids=word_ids)
        # The positional `device` argument is deprecated in transformers and
        # only triggers a FutureWarning, so it is not passed.
        ext_mask = self.get_extended_attention_mask(attention_mask, input_ids.shape)
        enc_out = self.encoder(
            emb,
            attention_mask=ext_mask,
            head_mask=[None] * self.config.num_hidden_layers,
            return_dict=True,
        )

        x = self.mlm_dense(enc_out.last_hidden_state)
        x = self.mlm_act(x)
        x = self.mlm_norm(x)

        # Tied output layer: score against the character embedding matrix.
        char_emb = self.dual_embeddings.char_embeddings.weight
        logits = x @ char_emb.T + self.mlm_bias

        # Debug guard: fail fast on non-finite logits and report the norms
        # of the embedding matrix and of the head activations.
        if not torch.isfinite(logits).all():
            emb_norm = char_emb.norm()
            x_norm = x.norm()
            raise RuntimeError(
                f"NaN/Inf in logits! char_emb_norm={emb_norm:.2f}, x_norm={x_norm:.2f}"
            )

        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss(ignore_index=-100, label_smoothing=0.1)
            loss = loss_fct(logits.view(-1, self.config.vocab_char_size), labels.view(-1))

        if not return_dict:
            return (loss, logits) if loss is not None else (logits,)
        return MaskedLMOutput(loss=loss, logits=logits)

word_vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff