#!/usr/bin/env python3
import hashlib
import json
import random
from pathlib import Path
from typing import Optional
# Directory that receives the generated JSONL splits (relative to the CWD).
OUT_DIR = Path("data")

# Seed the module-level RNG once, at import time, so that repeated runs of
# the script draw the same sequence of symbols.
SEED = 918
random.seed(SEED)

# Number of rows written to each split.
TRAIN_SIZE = 5000
VAL_SIZE = 1000
TEST_SIZE = 1000

# The three symbol alphabets a row samples from. Order matters: a symbol's
# position in its list is what semantic_weight() turns into a score.
VISIBLE_GLYPHS = [
    "✶", "✷", "✸", "✹", "✺", "✻", "✼", "✽",
    "✾", "✿", "❀", "❁", "❂", "❃", "❄", "❅",
]
BRAILLE_STATES = [
    "⠁", "⠃", "⠇", "⠏", "⠟", "⠿", "⡇", "⡿",
    "⡟", "⡯", "⡷", "⡻", "⠻", "⠽", "⠷", "⢿",
]
HANZI_CONTEXT = [
    "一", "二", "三", "四", "五", "六", "七", "八",
    "九", "十", "百", "千", "万", "亿", "兆", "世",
]
def compute_digest(visible: str, braille: str, hanzi: str) -> str:
    """Return the SHA3-256 hex digest of a (visible, braille, hanzi) triple.

    The three symbols are joined with ``|`` in the order visible, braille,
    hanzi, encoded as UTF-8, and hashed.

    Args:
        visible: Symbol for the visible layer.
        braille: Symbol for the state layer.
        hanzi: Symbol for the context layer.

    Returns:
        The digest as a 64-character lowercase hexadecimal string.
    """
    # NOTE: the "|" separator is not escaped, so the digest is only unambiguous
    # as long as none of the symbols themselves contain "|".
    payload = f"{visible}|{braille}|{hanzi}".encode("utf-8")
    return hashlib.sha3_256(payload).hexdigest()
def _normalized_rank(symbol: str, alphabet: list) -> float:
    """Return the position of ``symbol`` in ``alphabet`` scaled by its last index.

    Raises:
        ValueError: If ``symbol`` is not an element of ``alphabet``.
    """
    # max(1, ...) avoids dividing by zero when the alphabet has one element.
    return alphabet.index(symbol) / max(1, len(alphabet) - 1)


def semantic_weight(visible: str, braille: str, hanzi: str) -> float:
    """Return a weighted score derived from each symbol's list position.

    Each symbol's index in its alphabet is divided by that alphabet's last
    index (first element -> 0.0, last element -> 1.0); the three values are
    then combined as 0.4 * visible + 0.3 * braille + 0.3 * hanzi.

    Args:
        visible: A member of VISIBLE_GLYPHS.
        braille: A member of BRAILLE_STATES.
        hanzi: A member of HANZI_CONTEXT.

    Returns:
        The combined score rounded to 6 decimal places.

    Raises:
        ValueError: If any symbol is missing from its alphabet.
    """
    v = _normalized_rank(visible, VISIBLE_GLYPHS)
    b = _normalized_rank(braille, BRAILLE_STATES)
    h = _normalized_rank(hanzi, HANZI_CONTEXT)
    return round(0.4 * v + 0.3 * b + 0.3 * h, 6)
def generate_row(idx: int, rng: Optional[random.Random] = None) -> dict:
    """Build one dataset row from three randomly chosen symbols.

    Args:
        idx: Row number; rendered zero-padded to 8 digits in ``glyph_id``.
        rng: Optional random generator to draw from. When omitted, the
            module-level ``random`` functions (seeded at import with SEED)
            are used, which is the original behavior.

    Returns:
        A dict with the keys ``glyph_id``, ``visible``, ``braille``, ``hanzi``,
        ``semantic_weight``, ``digest`` and ``tri_key``; ``tri_key`` repeats
        the three symbols under ``visible_layer``, ``state_layer`` and
        ``context_layer``.
    """
    source = random if rng is None else rng

    # Draw order (visible, braille, hanzi) is part of the reproducible
    # sequence; changing it would change every generated row.
    visible = source.choice(VISIBLE_GLYPHS)
    braille = source.choice(BRAILLE_STATES)
    hanzi = source.choice(HANZI_CONTEXT)

    return {
        "glyph_id": f"glyph_{idx:08d}",
        "visible": visible,
        "braille": braille,
        "hanzi": hanzi,
        "semantic_weight": semantic_weight(visible, braille, hanzi),
        "digest": compute_digest(visible, braille, hanzi),
        "tri_key": {
            "visible_layer": visible,
            "state_layer": braille,
            "context_layer": hanzi,
        },
    }
def write_split(path: Path, start: int, size: int) -> None:
    """Write ``size`` generated rows to ``path`` in JSON Lines format.

    Rows are generated for the indices ``start`` through ``start + size - 1``
    and written one JSON object per line. An existing file at ``path`` is
    overwritten; the parent directory must already exist.

    Args:
        path: Destination file.
        start: Index passed to generate_row() for the first row.
        size: Number of rows to write; zero or less yields an empty file.
    """
    # newline="\n" turns off platform newline translation, so the file's bytes
    # are the same on every OS (default text mode writes "\r\n" on Windows).
    with path.open("w", encoding="utf-8", newline="\n") as f:
        for i in range(size):
            row = generate_row(start + i)
            # ensure_ascii=False keeps the symbols as literal UTF-8 text
            # instead of \uXXXX escapes.
            f.write(json.dumps(row, ensure_ascii=False) + "\n")
def main() -> None:
    """Create the output directory and write the train/validation/test splits.

    The splits are written in that order, and each split's starting index
    continues where the previous one ended, so ``glyph_id`` values do not
    repeat across the three files.
    """
    OUT_DIR.mkdir(parents=True, exist_ok=True)
    write_split(OUT_DIR / "train.jsonl", 0, TRAIN_SIZE)
    write_split(OUT_DIR / "validation.jsonl", TRAIN_SIZE, VAL_SIZE)
    write_split(OUT_DIR / "test.jsonl", TRAIN_SIZE + VAL_SIZE, TEST_SIZE)
    print("Built dataset at ./data")
# Build the dataset only when executed as a script, not when imported.
if __name__ == "__main__":
    main()