MaximEremeev committed on
Commit
3bfb9a0
·
verified ·
1 Parent(s): c5cd2b8

Add DualEmbLM

Browse files
Files changed (9) hide show
  1. README.md +54 -0
  2. align_dual.py +87 -0
  3. char_vocab.json +158 -0
  4. config.json +31 -0
  5. configuration_dual.py +38 -0
  6. embeddings.py +39 -0
  7. model.safetensors +3 -0
  8. modeling_dual.py +83 -0
  9. word_vocab.json +0 -0
README.md CHANGED
@@ -1,3 +1,57 @@
1
  ---
 
 
 
 
 
 
 
 
2
  license: apache-2.0
3
  ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ language: orv
3
+ tags:
4
+ - masked-language-modeling
5
+ - old-slavonic
6
+ - old-russian
7
+ - birchbark
8
+ - historical-nlp
9
+ - dual-embeddings
10
  license: apache-2.0
11
  ---
12
+
13
+ # DualEmbLM
14
+
15
+ A masked language model trained from scratch on Old East Slavic and Old Church Slavonic texts,
16
+ with dual character-level + word-level embeddings.
17
+
18
+ ## Architecture
19
+
20
+ DualEmbLM combines:
21
+ - **Character-level tokenisation** (1 character = 1 token) — enables precise lacuna restoration at the character level
22
+ - **Word-level context embeddings** — provides morphological and lexical context via a 50k word vocabulary
23
+ - **Transformer encoder** (BERT architecture, trained from scratch) — 6 layers, hidden size 512, 8 attention heads
24
+
25
+ The dual embeddings are concatenated and projected into the shared
26
+ hidden space before being passed to the transformer encoder.
27
+
28
+ ## Training
29
+
30
+ Trained on a corpus of Old East Slavic and Old Church Slavonic texts with a masked-language-modelling objective (MLM probability 8%, span masking, edge masking, random gap augmentation).
31
+
32
+ ## Usage
33
+
34
+ ```python
35
+ from transformers import AutoModelForMaskedLM
36
+
37
+ model = AutoModelForMaskedLM.from_pretrained(
38
+ "your-username/novgorodets",
39
+ trust_remote_code=True,
40
+ )
41
+ ```
42
+
43
+ ## Tasks
44
+
45
+ - **Lacuna restoration** (Test A Hit@1: 0.817, CER: 0.183)
46
+ - **Real gap restoration** (Test B char Hit@1: 0.466, span Hit@1: 0.222)
47
+
48
+ ## Citation
49
+
50
+ If you use this model, please cite:
51
+ ```
52
+ @mastersthesis{...,
53
+ title = {Automatic Restoration and Analysis of Birchbark Manuscripts},
54
+ author = {Maxim Eremeev},
55
+ year = {2026},
56
+ }
57
+ ```
align_dual.py ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import re
3
+ from pathlib import Path
4
+
5
# Alternatives recognised as atomic "special" tokens: bracketed control
# tokens ([CTX_*], [GAP], [MASK], [PAD], [UNK], [CLS], [SEP]) and the single
# characters "+", ":" and "·".
_SPECIAL_ALTERNATIVES = (
    r"\[CTX_[A-Z_]+\]",
    r"\[GAP\]",
    r"\[MASK\]",
    r"\[PAD\]",
    r"\[UNK\]",
    r"\[CLS\]",
    r"\[SEP\]",
    r"[+:·]",
)

# The whole alternation sits in one capturing group so that ``re.split``
# keeps the matched special tokens in its output.
SPECIAL_RE = re.compile("(" + "|".join(_SPECIAL_ALTERNATIVES) + ")")
8
+
9
+
10
+ def load_vocab(path: str | Path) -> dict[str, int]:
11
+ return json.loads(Path(path).read_text(encoding="utf-8"))
12
+
13
+
14
def split_special(text: str) -> list[str]:
    """Split *text* around special tokens, keeping the tokens themselves.

    The split uses ``SPECIAL_RE``; empty fragments produced by the split
    (e.g. between two adjacent special tokens) are dropped.
    """
    return list(filter(None, SPECIAL_RE.split(text)))
16
+
17
+
18
def align_char_to_word(
    text: str,
    char_vocab: dict[str, int],
    word_vocab: dict[str, int],
    max_len: int = 256,
    add_cls_sep: bool = True,
):
    """Encode *text* as parallel character-id and word-id sequences.

    Every character becomes one position. Each position also carries the id
    of the whitespace-delimited word it belongs to, so both sequences have
    the same length. Special tokens matched by ``SPECIAL_RE`` occupy a
    single position each.

    Args:
        text: Input string; leading/trailing whitespace is stripped.
        char_vocab: Character -> id mapping. Must contain ``[UNK]``,
            ``[PAD]``, ``[CLS]`` and ``[SEP]``.
        word_vocab: Word -> id mapping. Must contain ``[UNK_WORD]`` and
            ``[PAD_WORD]``.
        max_len: Length to which the sequences are truncated or padded.
        add_cls_sep: Whether to wrap the sequence in ``[CLS]`` / ``[SEP]``.

    Returns:
        A dict with the lists ``input_ids``, ``word_ids``,
        ``attention_mask`` and ``special_tokens_mask``.

    Raises:
        KeyError: If one of the required vocabulary entries is missing.
    """
    unk_c, pad_c, cls_c, sep_c = (
        char_vocab[tok] for tok in ("[UNK]", "[PAD]", "[CLS]", "[SEP]")
    )
    unk_w = word_vocab["[UNK_WORD]"]
    pad_w = word_vocab["[PAD_WORD]"]
    # [CLS]/[SEP] are optional in the word vocabulary; fall back to unknown.
    cls_w = word_vocab.get("[CLS]", unk_w)
    sep_w = word_vocab.get("[SEP]", unk_w)

    # Ids of every bracketed entry of the character vocabulary, e.g. "[MASK]".
    bracketed_ids = {
        idx
        for tok, idx in char_vocab.items()
        if tok.startswith("[") and tok.endswith("]")
    }

    char_seq = [cls_c] if add_cls_sep else []
    word_seq = [cls_w] if add_cls_sep else []

    for piece in split_special(text.strip()):
        if SPECIAL_RE.fullmatch(piece) is not None:
            # A special token takes exactly one position in both sequences.
            char_seq.append(char_vocab.get(piece, unk_c))
            word_seq.append(word_vocab.get(piece, unk_w))
            continue

        # The capturing group keeps whitespace runs as separate items.
        for run in re.split(r"(\s+)", piece):
            if not run:
                continue
            # Whitespace positions always carry the unknown-word id.
            run_word_id = unk_w if run.isspace() else word_vocab.get(run, unk_w)
            char_seq.extend(char_vocab.get(ch, unk_c) for ch in run)
            word_seq.extend([run_word_id] * len(run))

    if add_cls_sep:
        char_seq.append(sep_c)
        word_seq.append(sep_w)

    if len(char_seq) > max_len:
        char_seq = char_seq[:max_len]
        word_seq = word_seq[:max_len]
        if add_cls_sep:
            # Truncation cut off the closing marker; restore it in the last slot.
            char_seq[-1] = sep_c
            word_seq[-1] = sep_w

    real_len = len(char_seq)
    attention_mask = [1] * real_len
    special_tokens_mask = [int(tok_id in bracketed_ids) for tok_id in char_seq]

    # Right-pad up to max_len; padding counts as special and is not attended.
    padding = max(max_len - real_len, 0)
    char_seq.extend([pad_c] * padding)
    word_seq.extend([pad_w] * padding)
    attention_mask.extend([0] * padding)
    special_tokens_mask.extend([1] * padding)

    return {
        "input_ids": char_seq,
        "word_ids": word_seq,
        "attention_mask": attention_mask,
        "special_tokens_mask": special_tokens_mask,
    }
char_vocab.json ADDED
@@ -0,0 +1,158 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "[PAD]": 0,
3
+ "[UNK]": 1,
4
+ "[CLS]": 2,
5
+ "[SEP]": 3,
6
+ "[MASK]": 4,
7
+ "[GAP]": 5,
8
+ " ": 6,
9
+ "0": 7,
10
+ "1": 8,
11
+ "2": 9,
12
+ "3": 10,
13
+ "4": 11,
14
+ "5": 12,
15
+ "6": 13,
16
+ "7": 14,
17
+ "8": 15,
18
+ "9": 16,
19
+ "[": 17,
20
+ "]": 18,
21
+ "d": 19,
22
+ "f": 20,
23
+ "g": 21,
24
+ "h": 22,
25
+ "j": 23,
26
+ "l": 24,
27
+ "q": 25,
28
+ "r": 26,
29
+ "u": 27,
30
+ "v": 28,
31
+ "z": 29,
32
+ "º": 30,
33
+ "á": 31,
34
+ "â": 32,
35
+ "é": 33,
36
+ "í": 34,
37
+ "î": 35,
38
+ "ï": 36,
39
+ "ó": 37,
40
+ "ý": 38,
41
+ "ă": 39,
42
+ "ı": 40,
43
+ "ł": 41,
44
+ "ŕ": 42,
45
+ "ş": 43,
46
+ "ţ": 44,
47
+ "έ": 45,
48
+ "ή": 46,
49
+ "ί": 47,
50
+ "α": 48,
51
+ "β": 49,
52
+ "γ": 50,
53
+ "δ": 51,
54
+ "ε": 52,
55
+ "ζ": 53,
56
+ "η": 54,
57
+ "θ": 55,
58
+ "ι": 56,
59
+ "κ": 57,
60
+ "λ": 58,
61
+ "μ": 59,
62
+ "ξ": 60,
63
+ "ο": 61,
64
+ "π": 62,
65
+ "ρ": 63,
66
+ "ς": 64,
67
+ "σ": 65,
68
+ "τ": 66,
69
+ "υ": 67,
70
+ "φ": 68,
71
+ "χ": 69,
72
+ "ψ": 70,
73
+ "ϊ": 71,
74
+ "ό": 72,
75
+ "ώ": 73,
76
+ "Е": 74,
77
+ "М": 75,
78
+ "О": 76,
79
+ "П": 77,
80
+ "Р": 78,
81
+ "С": 79,
82
+ "а": 80,
83
+ "б": 81,
84
+ "в": 82,
85
+ "г": 83,
86
+ "д": 84,
87
+ "е": 85,
88
+ "ж": 86,
89
+ "з": 87,
90
+ "и": 88,
91
+ "й": 89,
92
+ "к": 90,
93
+ "л": 91,
94
+ "м": 92,
95
+ "н": 93,
96
+ "о": 94,
97
+ "п": 95,
98
+ "р": 96,
99
+ "с": 97,
100
+ "т": 98,
101
+ "у": 99,
102
+ "ф": 100,
103
+ "х": 101,
104
+ "ц": 102,
105
+ "ч": 103,
106
+ "ш": 104,
107
+ "щ": 105,
108
+ "ъ": 106,
109
+ "ы": 107,
110
+ "ь": 108,
111
+ "э": 109,
112
+ "ю": 110,
113
+ "я": 111,
114
+ "ѐ": 112,
115
+ "ё": 113,
116
+ "ђ": 114,
117
+ "ѓ": 115,
118
+ "є": 116,
119
+ "ѕ": 117,
120
+ "і": 118,
121
+ "ї": 119,
122
+ "ћ": 120,
123
+ "ќ": 121,
124
+ "ѝ": 122,
125
+ "ў": 123,
126
+ "џ": 124,
127
+ "ѡ": 125,
128
+ "ѣ": 126,
129
+ "ѥ": 127,
130
+ "ѧ": 128,
131
+ "ѩ": 129,
132
+ "ѫ": 130,
133
+ "ѭ": 131,
134
+ "ѯ": 132,
135
+ "ѱ": 133,
136
+ "ѳ": 134,
137
+ "ѵ": 135,
138
+ "ѹ": 136,
139
+ "ѿ": 137,
140
+ "҃": 138,
141
+ "ґ": 139,
142
+ "ӏ": 140,
143
+ "ӣ": 141,
144
+ "ӳ": 142,
145
+ "ἀ": 143,
146
+ "ὰ": 144,
147
+ "ὲ": 145,
148
+ "ὴ": 146,
149
+ "ὶ": 147,
150
+ "ὸ": 148,
151
+ "ὺ": 149,
152
+ "ꙁ": 150,
153
+ "ꙃ": 151,
154
+ "ꙋ": 152,
155
+ "ꙑ": 153,
156
+ "ꙗ": 154,
157
+ "ꙩ": 155
158
+ }
config.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "DualBertForMaskedLM"
4
+ ],
5
+ "attention_probs_dropout_prob": 0.1,
6
+ "classifier_dropout": null,
7
+ "hidden_act": "gelu",
8
+ "hidden_dropout_prob": 0.1,
9
+ "hidden_size": 512,
10
+ "initializer_range": 0.02,
11
+ "intermediate_size": 2048,
12
+ "layer_norm_eps": 1e-12,
13
+ "max_position_embeddings": 256,
14
+ "model_type": "dual_bert",
15
+ "num_attention_heads": 8,
16
+ "num_hidden_layers": 6,
17
+ "pad_token_id": 0,
18
+ "position_embedding_type": "absolute",
19
+ "torch_dtype": "float32",
20
+ "transformers_version": "4.48.0",
21
+ "type_vocab_size": 2,
22
+ "use_cache": true,
23
+ "vocab_char_size": 156,
24
+ "vocab_size": 156,
25
+ "vocab_word_size": 50000,
26
+ "word_char_emb_dim": 192,
27
+ "auto_map": {
28
+ "AutoConfig": "configuration_dual.DualBertConfig",
29
+ "AutoModelForMaskedLM": "modeling_dual.DualBertForMaskedLM"
30
+ }
31
+ }
configuration_dual.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import BertConfig
2
+
3
+
4
class DualBertConfig(BertConfig):
    """BERT configuration extended with the dual-embedding hyper-parameters.

    On top of the ``BertConfig`` fields it stores ``vocab_char_size``,
    ``vocab_word_size`` and ``word_char_emb_dim``. The parent's
    ``vocab_size`` is always set to ``vocab_char_size``.
    """

    model_type = "dual_bert"

    def __init__(
        self,
        vocab_char_size: int = 256,
        vocab_word_size: int = 50000,
        word_char_emb_dim: int = 192,
        hidden_size: int = 512,
        num_hidden_layers: int = 6,
        num_attention_heads: int = 8,
        intermediate_size: int = 2048,
        max_position_embeddings: int = 512,
        hidden_dropout_prob: float = 0.1,
        attention_probs_dropout_prob: float = 0.1,
        **kwargs,
    ):
        # A serialized config.json also carries ``vocab_size``; discard it so
        # it cannot clash with the explicit vocab_size=vocab_char_size below.
        kwargs.pop("vocab_size", None)

        bert_args = {
            "vocab_size": vocab_char_size,
            "hidden_size": hidden_size,
            "num_hidden_layers": num_hidden_layers,
            "num_attention_heads": num_attention_heads,
            "intermediate_size": intermediate_size,
            "max_position_embeddings": max_position_embeddings,
            "hidden_dropout_prob": hidden_dropout_prob,
            "attention_probs_dropout_prob": attention_probs_dropout_prob,
        }
        super().__init__(**bert_args, **kwargs)

        # Fields specific to the dual (character + word) embedding layer.
        self.vocab_char_size = vocab_char_size
        self.vocab_word_size = vocab_word_size
        self.word_char_emb_dim = word_char_emb_dim
embeddings.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+
4
+
5
class DualEmbeddings(nn.Module):
    """Input layer that fuses character and word embeddings.

    For each position the character embedding and the word embedding (both
    of size ``config.word_char_emb_dim``) are concatenated, linearly
    projected to ``config.hidden_size``, summed with a learned absolute
    position embedding, layer-normalised and passed through dropout.
    """

    def __init__(self, config):
        super().__init__()
        d = config.word_char_emb_dim

        self.char_embeddings = nn.Embedding(
            config.vocab_char_size, d, padding_idx=config.pad_token_id
        )
        # NOTE(review): padding_idx is hard-coded to 0 here; this presumes
        # "[PAD_WORD]" has id 0 in the word vocabulary -- confirm.
        self.word_embeddings = nn.Embedding(
            config.vocab_word_size, d, padding_idx=0
        )
        self.projection = nn.Linear(2 * d, config.hidden_size, bias=False)
        self.position_embeddings = nn.Embedding(
            config.max_position_embeddings, config.hidden_size
        )
        self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

        # Non-persistent: rebuilt on construction, not stored in checkpoints.
        self.register_buffer(
            "position_ids",
            torch.arange(config.max_position_embeddings).unsqueeze(0),
            persistent=False,
        )

    def forward(self, input_ids, word_ids):
        """Embed a batch of aligned character ids and word ids.

        Args:
            input_ids: 2-D integer tensor of character ids.
            word_ids: Integer tensor of word ids, same shape as ``input_ids``.

        Returns:
            Tensor with the shape of ``input_ids`` plus a trailing
            ``hidden_size`` dimension.

        Raises:
            ValueError: If the sequence is longer than the number of
                available position embeddings.
        """
        _, seq_len = input_ids.shape
        max_positions = self.position_ids.shape[1]
        if seq_len > max_positions:
            # Without this guard the sliced position ids would be shorter
            # than the sequence and the addition below would fail with an
            # opaque broadcasting error.
            raise ValueError(
                f"Sequence length {seq_len} exceeds max_position_embeddings "
                f"({max_positions})."
            )
        pos_ids = self.position_ids[:, :seq_len]

        c = self.char_embeddings(input_ids)
        w = self.word_embeddings(word_ids)

        x = torch.cat([c, w], dim=-1)
        x = self.projection(x)
        x = x + self.position_embeddings(pos_ids)
        x = self.layer_norm(x)
        x = self.dropout(x)
        return x
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e44e2787dc9518bcf8a3c1efcd7a8cad8639cd0a94ca89ec93ba80382b00ec07
3
+ size 115899720
modeling_dual.py ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import torch
import torch.nn as nn
from transformers import BertPreTrainedModel
from transformers.modeling_outputs import MaskedLMOutput
from transformers.models.bert.modeling_bert import BertEncoder

# Relative imports of the sibling modules shipped in this repository; the
# configuration class lives in ``configuration_dual.py`` (see ``auto_map``
# in config.json), not in a module named ``config``.
from .configuration_dual import DualBertConfig
from .embeddings import DualEmbeddings


class DualBertForMaskedLM(BertPreTrainedModel):
    """BERT encoder with dual (character + word) input embeddings and a
    character-level masked-LM head.

    The head projects encoder states to ``word_char_emb_dim`` and scores
    them against the character embedding matrix, so the output layer shares
    its weights with the input character embeddings.
    """

    config_class = DualBertConfig

    def __init__(self, config: DualBertConfig):
        super().__init__(config)
        self.dual_embeddings = DualEmbeddings(config)
        self.encoder = BertEncoder(config)

        # MLM head: dense -> GELU -> LayerNorm, then a tied output projection.
        self.mlm_dense = nn.Linear(config.hidden_size, config.word_char_emb_dim)
        self.mlm_act = nn.GELU()
        self.mlm_norm = nn.LayerNorm(config.word_char_emb_dim, eps=config.layer_norm_eps)
        self.mlm_bias = nn.Parameter(torch.zeros(config.vocab_char_size))

        self.post_init()

    def get_input_embeddings(self):
        """Return the character embedding table."""
        return self.dual_embeddings.char_embeddings

    def set_input_embeddings(self, value):
        """Replace the character embedding table."""
        self.dual_embeddings.char_embeddings = value

    def forward(
        self,
        input_ids=None,
        word_ids=None,
        attention_mask=None,
        labels=None,
        return_dict=True,
        **kwargs,
    ):
        """Run the encoder and compute character logits (and optionally loss).

        Args:
            input_ids: Character ids; required.
            word_ids: Word ids aligned position-by-position with
                ``input_ids``; required.
            attention_mask: Optional mask; defaults to all ones.
            labels: Optional target character ids; positions equal to -100
                are ignored by the loss.
            return_dict: If false, return a tuple instead of
                ``MaskedLMOutput``.
            **kwargs: Accepted for call compatibility and ignored.

        Returns:
            ``MaskedLMOutput(loss, logits)``, or ``(loss, logits)`` /
            ``(logits,)`` when ``return_dict`` is false.

        Raises:
            ValueError: If ``input_ids`` or ``word_ids`` is missing.
            RuntimeError: If the logits contain NaN or Inf.
        """
        if input_ids is None or word_ids is None:
            raise ValueError("Both input_ids and word_ids are required.")

        if attention_mask is None:
            attention_mask = torch.ones_like(input_ids, dtype=torch.long)

        emb = self.dual_embeddings(input_ids=input_ids, word_ids=word_ids)
        ext_mask = self.get_extended_attention_mask(attention_mask, input_ids.shape, input_ids.device)

        enc_out = self.encoder(
            emb,
            attention_mask=ext_mask,
            head_mask=[None] * self.config.num_hidden_layers,
            return_dict=True,
        )
        seq = enc_out.last_hidden_state

        x = self.mlm_dense(seq)
        x = self.mlm_act(x)
        x = self.mlm_norm(x)

        # Tied output projection: score against the character embeddings.
        char_emb = self.dual_embeddings.char_embeddings.weight
        logits = x @ char_emb.T + self.mlm_bias

        # DEBUG: monitor embedding norms when the logits become non-finite.
        # A single isfinite pass covers both NaN and +/-Inf.
        if not torch.isfinite(logits).all():
            emb_norm = char_emb.norm()
            x_norm = x.norm()
            raise RuntimeError(
                f"NaN/Inf in logits! char_emb_norm={emb_norm:.2f}, x_norm={x_norm:.2f}"
            )

        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss(ignore_index=-100, label_smoothing=0.1)
            loss = loss_fct(logits.view(-1, self.config.vocab_char_size), labels.view(-1))

        if not return_dict:
            return (loss, logits) if loss is not None else (logits,)

        return MaskedLMOutput(loss=loss, logits=logits)
word_vocab.json ADDED
The diff for this file is too large to render. See raw diff