MetaCortex-Dynamics committed
Commit 83b737d · verified · 1 Parent(s): e5f14b1

Create pipeline/mdlm/tokenizer.py

Files changed (1)
  1. pipeline/mdlm/tokenizer.py +235 -0
pipeline/mdlm/tokenizer.py ADDED
@@ -0,0 +1,235 @@
"""
MDLM Tokenizer: encodes governed structures as discrete token sequences.

The MDLM learns the STRUCTURE of valid operator compositions, not the
prose content. Evidence strings are metadata for traceability; they are
NOT tokenized. The kernel learns which operators appear in which modalities
in which order, with which witness attestations.

Vocabulary (34 tokens):
- 15 operator tokens (THIS through NEAR/FAR)
- 7 witness tokens (WHAT through WHENCE)
- 2 witness status tokens (ATTESTED, WITHHELD)
- 6 channel delimiter tokens (<G> </G> <S> </S> <F> </F>)
- 2 sequence tokens (<BOS> <EOS>)
- 2 special tokens (<PAD> <MASK>)

Total: 34 tokens, orders of magnitude smaller than prose LLM vocabularies.
The complexity lives in sequence-level structure, not token identity.

Sequence format:
    <BOS> <G> op op op </G> <S> op op </S> <F> op op </F>
    WIT STATUS WIT STATUS ... (one pair per witness, 7 pairs) <EOS>

Hierarchical masking tiers:
    Tier 1 (3 operators, unmasked first): THIS, SAME/NOT-SAME, NO
    Tier 2 (6 operators, unmasked second): GOES-WITH, TOGETHER/ALONE,
        MANY/ONE, EVERY/SOME, MORE/LESS, CAN/CANNOT
    Tier 3 (unmasked last): INSIDE/OUTSIDE, NEAR/FAR, IF/THEN, BECAUSE,
        MAYBE, MUST/LET, plus witness identity and status (readiness) tokens
"""

from __future__ import annotations

import json
from pathlib import Path

from pipeline.types import Op, Witness


# ═══════════════════════════════════════════════════════════════════════════════
# TOKEN VOCABULARY
# ═══════════════════════════════════════════════════════════════════════════════

# Special tokens
PAD = 0
MASK = 1
BOS = 2
EOS = 3

# Channel delimiters (channel_a uses <G>...</G>, channel_b <S>...</S>,
# channel_c <F>...</F>; see encode below)
G_OPEN = 4
G_CLOSE = 5
S_OPEN = 6
S_CLOSE = 7
F_OPEN = 8
F_CLOSE = 9

# 15 operator tokens (indices 10-24, matching Op enum value + 10)
OP_OFFSET = 10

# 7 witness tokens (indices 25-31)
WIT_OFFSET = 25

# Witness status
ATTESTED = 32
WITHHELD = 33

VOCAB_SIZE = 34

# Token names for display
TOKEN_NAMES = [
    "<PAD>", "<MASK>", "<BOS>", "<EOS>",
    "<G>", "</G>", "<S>", "</S>", "<F>", "</F>",
    "THIS", "GOES-WITH", "MANY/ONE", "EVERY/SOME", "NO",
    "IF/THEN", "BECAUSE", "SAME/NOT-SAME", "INSIDE/OUTSIDE",
    "CAN/CANNOT", "MAYBE", "MUST/LET", "TOGETHER/ALONE",
    "MORE/LESS", "NEAR/FAR",
    "WHAT", "WHERE", "WHICH", "WHEN", "FOR-WHAT", "HOW", "WHENCE",
    "ATTESTED", "WITHHELD",
]

assert len(TOKEN_NAMES) == VOCAB_SIZE
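
# Quick sanity checks of the id-to-name mapping (doctest-style, illustrative;
# the expected values follow directly from the constants above):
#
#     >>> TOKEN_NAMES[MASK]
#     '<MASK>'
#     >>> TOKEN_NAMES[OP_OFFSET]    # first operator token
#     'THIS'
#     >>> TOKEN_NAMES[WIT_OFFSET]   # first witness token
#     'WHAT'
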
# ═══════════════════════════════════════════════════════════════════════════════
# HIERARCHICAL MASKING TIERS
# ═══════════════════════════════════════════════════════════════════════════════

# Tier 1 (3 operators): unmasked first
TIER_1_TOKENS = {
    OP_OFFSET + Op.THIS.value,
    OP_OFFSET + Op.SAME_NOT_SAME.value,
    OP_OFFSET + Op.NO.value,
}

# Tier 2 (6 operators): unmasked second
TIER_2_TOKENS = {
    OP_OFFSET + Op.GOES_WITH.value,
    OP_OFFSET + Op.TOGETHER_ALONE.value,
    OP_OFFSET + Op.MANY_ONE.value,
    OP_OFFSET + Op.EVERY_SOME.value,
    OP_OFFSET + Op.MORE_LESS.value,
    OP_OFFSET + Op.CAN_CANNOT.value,
}

# Tier 3 (6 operators, 2 status tokens, 7 witness tokens; 15 in all): unmasked last
TIER_3_TOKENS = {
    OP_OFFSET + Op.INSIDE_OUTSIDE.value,
    OP_OFFSET + Op.NEAR_FAR.value,
    OP_OFFSET + Op.IF_THEN.value,
    OP_OFFSET + Op.BECAUSE.value,
    OP_OFFSET + Op.MAYBE.value,
    OP_OFFSET + Op.MUST_LET.value,
    ATTESTED,
    WITHHELD,
    # Witness identity tokens are also Tier 3 (readiness)
} | {WIT_OFFSET + w.value for w in Witness}

# Structural tokens are never masked; they define the frame
NEVER_MASKED = {PAD, BOS, EOS, G_OPEN, G_CLOSE, S_OPEN, S_CLOSE, F_OPEN, F_CLOSE}


# ═══════════════════════════════════════════════════════════════════════════════
# ENCODE / DECODE
# ═══════════════════════════════════════════════════════════════════════════════

def encode(example: dict) -> list[int]:
    """Encode a FrameExample (from JSONL) as a token sequence.

    Format:
        <BOS> <G> op... </G> <S> op... </S> <F> op... </F>
        wit:status wit:status ... <EOS>
    """
    tokens = [BOS]

    # Modalities
    for mod_key, open_tok, close_tok in [
        ("channel_a", G_OPEN, G_CLOSE),
        ("channel_b", S_OPEN, S_CLOSE),
        ("channel_c", F_OPEN, F_CLOSE),
    ]:
        tokens.append(open_tok)
        mod = example.get(mod_key, {})
        for op_entry in mod.get("operators", []):
            op_name = op_entry.get("operator", "")
            op_val = Op.from_name(op_name)
            if op_val is not None:
                tokens.append(OP_OFFSET + op_val.value)
        tokens.append(close_tok)

    # Witnesses
    for w in Witness:
        wit_data = example.get("witnesses", {}).get(w.canonical_name, {})
        tokens.append(WIT_OFFSET + w.value)
        if wit_data.get("attested", False):
            tokens.append(ATTESTED)
        else:
            tokens.append(WITHHELD)

    tokens.append(EOS)
    return tokens
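
# Note: every encoded sequence carries a fixed structural overhead of 22
# tokens: <BOS> and <EOS> (2), six channel delimiters, and seven
# (witness, status) pairs (14). Sequence length is therefore 22 plus the
# number of operator tokens emitted across the three channels.
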
def decode(tokens: list[int]) -> str:
    """Decode a token sequence to a human-readable string."""
    return " ".join(TOKEN_NAMES[t] if 0 <= t < VOCAB_SIZE else f"?{t}" for t in tokens)


def pad_sequence(tokens: list[int], max_len: int) -> list[int]:
    """Pad or truncate a token sequence to a fixed length."""
    if len(tokens) >= max_len:
        return tokens[:max_len]
    return tokens + [PAD] * (max_len - len(tokens))


def get_tier(token_id: int) -> int:
    """Return the masking tier for a token (1, 2, 3, or 0 for never-masked)."""
    if token_id in NEVER_MASKED:
        return 0
    if token_id in TIER_1_TOKENS:
        return 1
    if token_id in TIER_2_TOKENS:
        return 2
    if token_id in TIER_3_TOKENS:
        return 3
    return 0  # unknown tokens are treated as structural (never masked)
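
# Illustrative tier lookups (both follow directly from the sets above):
#
#     >>> get_tier(G_OPEN)     # delimiters are never masked
#     0
#     >>> get_tier(ATTESTED)   # witness status unmasks last
#     3
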
# ═══════════════════════════════════════════════════════════════════════════════
# CORPUS LOADER
# ═══════════════════════════════════════════════════════════════════════════════

def load_corpus(corpus_dir: str | Path) -> list[list[int]]:
    """Load all governed examples from a corpus directory and encode them."""
    corpus_dir = Path(corpus_dir)
    examples_dir = corpus_dir / "examples"
    if not examples_dir.exists():
        examples_dir = corpus_dir

    sequences = []
    for jsonl_path in sorted(examples_dir.glob("*.jsonl")):
        with open(jsonl_path, encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                example = json.loads(line)
                tokens = encode(example)
                sequences.append(tokens)

    return sequences


def corpus_statistics(sequences: list[list[int]]) -> dict:
    """Compute statistics over an encoded corpus."""
    from collections import Counter

    lengths = [len(s) for s in sequences]
    token_counts = Counter()
    tier_counts = Counter()

    for seq in sequences:
        for t in seq:
            token_counts[t] += 1
            tier_counts[get_tier(t)] += 1

    return {
        "num_sequences": len(sequences),
        "min_length": min(lengths) if lengths else 0,
        "max_length": max(lengths) if lengths else 0,
        "mean_length": sum(lengths) / len(lengths) if lengths else 0,
        "vocab_usage": {TOKEN_NAMES[t]: c for t, c in token_counts.most_common()},
        "tier_distribution": {f"tier_{t}": c for t, c in sorted(tier_counts.items())},
    }
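

if __name__ == "__main__":
    # Minimal smoke test (illustrative, not part of the pipeline API): a
    # structurally empty example exercises only the framing tokens, since no
    # operators are listed and every witness defaults to WITHHELD. A real
    # FrameExample would also populate each channel's "operators" list and
    # the "witnesses" mapping keyed by Witness.canonical_name.
    demo = {"channel_a": {}, "channel_b": {}, "channel_c": {}, "witnesses": {}}
    toks = encode(demo)
    print(len(toks), "tokens:", decode(toks))      # 22 structural tokens
    print("padded to 32:", pad_sequence(toks, 32))
    # To inspect an actual corpus (directory path is hypothetical):
    #     seqs = load_corpus("corpus/")
    #     print(corpus_statistics(seqs))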
+ }