#!/usr/bin/env python3
"""Build a synthetic three-layer glyph dataset as JSONL splits.

Each row combines one symbol from each of three fixed 16-element alphabets
(a visible glyph, a braille state and a hanzi context character) and records
a weighted score and a SHA3-256 digest derived from that triple.

Running the script writes ``train.jsonl``, ``validation.jsonl`` and
``test.jsonl`` into ``OUT_DIR``.

NOTE: symbols are drawn independently with replacement, and only
16 * 16 * 16 = 4096 distinct triples exist, so row *content* necessarily
repeats inside the 5000-row train split (and can repeat across splits).
Only ``glyph_id`` is unique per row.
"""

import hashlib
import json
import random
from pathlib import Path

OUT_DIR = Path("data")

SEED = 918
# Seed the shared module-level generator once at import time so that the
# sequence of random.choice() calls below is reproducible from run to run.
random.seed(SEED)

TRAIN_SIZE = 5000
VAL_SIZE = 1000
TEST_SIZE = 1000

VISIBLE_GLYPHS = [
    "✶", "✷", "✸", "✹", "✺", "✻", "✼", "✽",
    "✾", "✿", "❀", "❁", "❂", "❃", "❄", "❅",
]

BRAILLE_STATES = [
    "⠁", "⠃", "⠇", "⠏", "⠟", "⠿", "⡇", "⡿",
    "⡟", "⡯", "⡷", "⡻", "⠻", "⠽", "⠷", "⢿",
]

HANZI_CONTEXT = [
    "一", "二", "三", "四", "五", "六", "七", "八",
    "九", "十", "百", "千", "万", "亿", "兆", "世",
]


def compute_digest(visible: str, braille: str, hanzi: str) -> str:
    """Return the SHA3-256 hex digest of the ``visible|braille|hanzi`` triple.

    The three symbols are joined with ``|`` and encoded as UTF-8 before
    hashing, so the digest depends on both the symbols and their order.
    """
    payload = f"{visible}|{braille}|{hanzi}".encode("utf-8")
    return hashlib.sha3_256(payload).hexdigest()


def _normalized_rank(alphabet: list, symbol: str) -> float:
    """Return the position of *symbol* in *alphabet* scaled to ``[0, 1]``.

    Raises:
        ValueError: if *symbol* is not a member of *alphabet*.
    """
    # max(1, ...) avoids dividing by zero for a single-element alphabet.
    return alphabet.index(symbol) / max(1, len(alphabet) - 1)


def semantic_weight(visible: str, braille: str, hanzi: str) -> float:
    """Return the weighted positional score of a symbol triple.

    Each symbol's index in its alphabet is normalised to ``[0, 1]`` and the
    three values are blended as ``0.4 * visible + 0.3 * braille + 0.3 * hanzi``,
    rounded to six decimal places.

    Raises:
        ValueError: if any symbol is missing from its alphabet.
    """
    v = _normalized_rank(VISIBLE_GLYPHS, visible)
    b = _normalized_rank(BRAILLE_STATES, braille)
    h = _normalized_rank(HANZI_CONTEXT, hanzi)
    return round(0.4 * v + 0.3 * b + 0.3 * h, 6)


def generate_row(idx: int) -> dict:
    """Draw one random symbol triple and return its dataset row.

    Args:
        idx: Row number, rendered zero-padded to eight digits in ``glyph_id``.

    Returns:
        A dict holding the id, the three symbols, their ``semantic_weight``,
        their ``digest`` and a nested ``tri_key`` that repeats the symbols
        under layer names.
    """
    # Draw order (visible, braille, hanzi) determines how the seeded random
    # stream is consumed; changing it would change the generated data.
    visible = random.choice(VISIBLE_GLYPHS)
    braille = random.choice(BRAILLE_STATES)
    hanzi = random.choice(HANZI_CONTEXT)

    return {
        "glyph_id": f"glyph_{idx:08d}",
        "visible": visible,
        "braille": braille,
        "hanzi": hanzi,
        "semantic_weight": semantic_weight(visible, braille, hanzi),
        "digest": compute_digest(visible, braille, hanzi),
        "tri_key": {
            "visible_layer": visible,
            "state_layer": braille,
            "context_layer": hanzi,
        },
    }


def write_split(path: Path, start: int, size: int) -> None:
    """Write *size* generated rows to *path*, one JSON object per line.

    Args:
        path: Destination file; it is opened in write mode and overwritten.
        start: ``idx`` of the first row; rows use ``start .. start + size - 1``.
        size: Number of rows to write.
    """
    with path.open("w", encoding="utf-8") as f:
        for i in range(size):
            row = generate_row(start + i)
            # ensure_ascii=False keeps the glyphs as literal UTF-8 characters
            # instead of \uXXXX escapes.
            f.write(json.dumps(row, ensure_ascii=False) + "\n")


def main() -> None:
    """Create ``OUT_DIR`` and write the train, validation and test splits.

    Row ids are contiguous across the splits: train starts at 0, validation
    at ``TRAIN_SIZE`` and test at ``TRAIN_SIZE + VAL_SIZE``.
    """
    OUT_DIR.mkdir(parents=True, exist_ok=True)

    write_split(OUT_DIR / "train.jsonl", 0, TRAIN_SIZE)
    write_split(OUT_DIR / "validation.jsonl", TRAIN_SIZE, VAL_SIZE)
    write_split(OUT_DIR / "test.jsonl", TRAIN_SIZE + VAL_SIZE, TEST_SIZE)

    print("Built dataset at ./data")


if __name__ == "__main__":
    main()