vil-encoder / vil_dataset_builder.py
Nine1Eight
Initial Linux build for VIL encoder
e566f33
#!/usr/bin/env python3
import json
import hashlib
import random
from pathlib import Path
# Directory that receives the generated JSONL split files.
OUT_DIR = Path("data")

# Fixed seed applied to the global ``random`` generator at import time so the
# glyph draws follow the same sequence on every run.
SEED = 918
random.seed(SEED)

# Number of rows written to the train / validation / test splits.
TRAIN_SIZE, VAL_SIZE, TEST_SIZE = 5_000, 1_000, 1_000
# Symbols for the "visible" layer; a glyph's position in this list feeds
# semantic_weight(), so the order is significant.
VISIBLE_GLYPHS = "✶ ✷ ✸ ✹ ✺ ✻ ✼ ✽ ✾ ✿ ❀ ❁ ❂ ❃ ❄ ❅".split()
# Braille patterns for the "state" layer; a pattern's position in this list
# feeds semantic_weight(), so the order is significant.
BRAILLE_STATES = "⠁ ⠃ ⠇ ⠏ ⠟ ⠿ ⡇ ⡿ ⡟ ⡯ ⡷ ⡻ ⠻ ⠽ ⠷ ⢿".split()
# Hanzi characters for the "context" layer; a character's position in this
# list feeds semantic_weight(), so the order is significant.
HANZI_CONTEXT = "一 二 三 四 五 六 七 八 九 十 百 千 万 亿 兆 世".split()
def compute_digest(visible: str, braille: str, hanzi: str) -> str:
    """Return the SHA3-256 hex digest of the three layers joined by ``|``.

    The layers are combined in the order visible, braille, hanzi and the
    resulting text is encoded as UTF-8 before hashing.
    """
    joined = "{}|{}|{}".format(visible, braille, hanzi)
    hasher = hashlib.sha3_256()
    hasher.update(joined.encode("utf-8"))
    return hasher.hexdigest()
def semantic_weight(visible: str, braille: str, hanzi: str) -> float:
    """Blend the three symbols' table positions into one rounded score.

    Each symbol is mapped to its index in its table divided by the table's
    last index (at least 1, to avoid dividing by zero). The fractions are
    combined with weights 0.4 (visible), 0.3 (braille) and 0.3 (hanzi), and
    the result is rounded to six decimal places.

    Raises:
        ValueError: if a symbol is not present in its table (from
            ``list.index``).
    """

    def _position(table: list, symbol: str) -> float:
        # Fraction of the way through ``table`` at which ``symbol`` sits.
        return table.index(symbol) / max(1, len(table) - 1)

    # Lookups run in visible, braille, hanzi order, and the terms are added
    # left to right, so the floating-point result matches a plain sum of the
    # three products.
    blended = (
        0.4 * _position(VISIBLE_GLYPHS, visible)
        + 0.3 * _position(BRAILLE_STATES, braille)
        + 0.3 * _position(HANZI_CONTEXT, hanzi)
    )
    return round(blended, 6)
def generate_row(idx: int) -> dict:
    """Create one dataset record with randomly chosen layer symbols.

    One symbol is drawn from each of the three tables using the global
    ``random`` generator. The record carries a zero-padded ``glyph_id``
    built from ``idx``, the three symbols, their semantic weight, their
    digest, and a nested ``tri_key`` dict repeating the symbols under
    layer names.
    """
    # The draw order (visible, braille, hanzi) fixes which random values
    # each layer consumes, so it must not be rearranged.
    visible, braille, hanzi = (
        random.choice(table)
        for table in (VISIBLE_GLYPHS, BRAILLE_STATES, HANZI_CONTEXT)
    )

    layers = {
        "visible_layer": visible,
        "state_layer": braille,
        "context_layer": hanzi,
    }

    # Keys are inserted in the order they appear in the serialized row.
    record = {
        "glyph_id": f"glyph_{idx:08d}",
        "visible": visible,
        "braille": braille,
        "hanzi": hanzi,
    }
    record["semantic_weight"] = semantic_weight(visible, braille, hanzi)
    record["digest"] = compute_digest(visible, braille, hanzi)
    record["tri_key"] = layers
    return record
def write_split(path: Path, start: int, size: int) -> None:
    """Write ``size`` generated rows to ``path`` as JSON Lines.

    Rows are produced by ``generate_row`` for the indices ``start``,
    ``start + 1``, ... in order. The file is opened in write mode with UTF-8
    encoding, and non-ASCII characters are written as-is rather than escaped.
    """
    with path.open("w", encoding="utf-8") as out:
        # The generator is consumed lazily, so rows are created and written
        # one at a time in index order.
        out.writelines(
            json.dumps(generate_row(start + offset), ensure_ascii=False) + "\n"
            for offset in range(size)
        )
def main() -> None:
    """Build the train, validation and test JSONL splits under ``OUT_DIR``.

    Creates ``OUT_DIR`` if needed, then writes ``train.jsonl``,
    ``validation.jsonl`` and ``test.jsonl`` with ``TRAIN_SIZE``, ``VAL_SIZE``
    and ``TEST_SIZE`` rows respectively. Row indices run contiguously across
    the three files, starting at 0.
    """
    # Re-seed here so every call produces the same dataset, regardless of how
    # much the global generator was used since the module was imported. For a
    # plain script run this yields the same rows as seeding at import time.
    random.seed(SEED)

    OUT_DIR.mkdir(parents=True, exist_ok=True)

    splits = (
        ("train.jsonl", TRAIN_SIZE),
        ("validation.jsonl", VAL_SIZE),
        ("test.jsonl", TEST_SIZE),
    )
    next_index = 0
    for filename, row_count in splits:
        write_split(OUT_DIR / filename, next_index, row_count)
        # Each split continues where the previous one stopped.
        next_index += row_count

    print("Built dataset at ./data")
# Build the dataset only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()