import json
import random
import re
from pathlib import Path
# Config
COUNT = 2000  # number of NEW unique entries to generate per run
SEED_FILE = Path("avh_math/db/text_cross_seed.jsonl")  # raw-text seed corpus (JSONL, appended to)
KB_FILE = Path("avh_math/db/text_cross_kb.jsonl")  # decomposed knowledge-base records (JSONL, appended to)
# Ensure directories
SEED_FILE.parent.mkdir(parents=True, exist_ok=True)
KB_FILE.parent.mkdir(parents=True, exist_ok=True)
# --- 1. Load Existing Data (for Uniqueness Check) ---
# Global de-duplication state: populated by load_existing(), extended in main().
existing_texts = set()  # raw_text strings already present in SEED_FILE
existing_sigs = set()  # tuple(structure_signature) values already present in KB_FILE
def load_existing():
    """Populate the global de-duplication sets from the existing JSONL files.

    Reads SEED_FILE into ``existing_texts`` (the ``raw_text`` values) and
    KB_FILE into ``existing_sigs`` (``structure_signature`` as tuples).
    Malformed lines are skipped so a partially corrupted file does not abort
    a run.  Missing files are simply treated as empty.
    """
    if SEED_FILE.exists():
        with SEED_FILE.open("r", encoding="utf-8") as f:
            for line in f:
                try:
                    obj = json.loads(line)
                    existing_texts.add(obj.get("raw_text", ""))
                # Was a bare `except: pass`, which also swallowed SystemExit /
                # KeyboardInterrupt.  Only skip decode failures (blank or
                # corrupt lines) and non-object JSON values (no .get()).
                except (json.JSONDecodeError, AttributeError):
                    continue
    if KB_FILE.exists():
        with KB_FILE.open("r", encoding="utf-8") as f:
            for line in f:
                try:
                    obj = json.loads(line)
                    sig = tuple(obj.get("structure_signature", []))
                    existing_sigs.add(sig)
                # TypeError additionally covers a non-iterable signature value.
                except (json.JSONDecodeError, AttributeError, TypeError):
                    continue
    print(f"Loaded {len(existing_texts)} existing texts and {len(existing_sigs)} signatures.")
# --- 2. Advanced Generator (Focus on Novelty) ---
# Expanded Vocabulary
JP_PARTS_2 = ["かつ", "または", "すべての", "存在する", "写像", "空間", "演算", "同型", "正規", "可換", "一意", "満たす", "要素"]
EN_PARTS_2 = ["forall", "exists", "map", "space", "op", "iso", "normal", "commute", "unique", "s.t.", "element", "implies", "iff"]
SYMBOLS_2 = ["X", "Y", "Z", "r1", "r2", "theta", "phi", "sum", "prod", "lim", "int", "del", "nabla"]
ARROWS_2 = ["=>", "<=", "<=>", "-->", "<--"]
MODALS_2 = ["Box", "Diamond", "[a]", "<a>", "K", "T", "S4"]
BRACKETS_2 = ["{", "}", "<", ">", "|", "||"]
OTHERS_2 = ["#", "$", "%", "&", "@", "^", "_", "`", "~", ";;", "::"]


def generate_novel_text():
    """Build one random pseudo-formal text line.

    Draws a token count (deliberately either very short or very long), picks
    a generation mode, samples tokens from that mode's vocabulary pool, and
    glues them together with a per-token coin flip between "no space" and
    "space" — the fused tokens are what makes the output look novel.
    """
    # Very short or very long sequences for maximal length diversity.
    token_count = random.choice([2, 3, 4, 15, 20, 25])
    # Vocabulary pool per structural mode (same contents and order as the
    # per-iteration concatenations the loop used to build).
    pool_by_mode = {
        "dense_symbol": SYMBOLS_2 + ARROWS_2 + MODALS_2 + OTHERS_2,
        "verbose_text": JP_PARTS_2 + EN_PARTS_2,
        "bracket_mess": BRACKETS_2 + SYMBOLS_2,
        "arrow_chain": ARROWS_2 + SYMBOLS_2,
        "mixed_chaos": JP_PARTS_2 + EN_PARTS_2 + SYMBOLS_2 + ARROWS_2 + MODALS_2 + BRACKETS_2 + OTHERS_2,
    }
    mode = random.choice(["dense_symbol", "verbose_text", "bracket_mess", "arrow_chain", "mixed_chaos"])
    pool = pool_by_mode[mode]
    chosen = [random.choice(pool) for _ in range(token_count)]
    # 50% chance of no separator, intentionally creating weird merged tokens.
    pieces = []
    for tok in chosen:
        glue = "" if random.random() < 0.5 else " "
        pieces.append(glue + tok)
    return "".join(pieces).strip()
# --- 3. Decomposition Logic (Consistent with previous) ---
def classify_shape(token):
    """Classify one token as arrow, modal, bracket, symbol, word, or other.

    Check order matters: the literal arrow/modal/bracket lists come first so
    that e.g. "Box" is "modal" rather than a generic "word".
    """
    if token in ["->", "→", "=>", "<-", "<=", "<=>", "-->", "<--"]:
        return "arrow"
    if token in ["[]", "□", "<>", "Diamond", "Box", "[a]", "<a>"]:
        return "modal"
    if token in ["(", ")", "[", "]", "{", "}", "\"", "'", "<", ">", "|", "||"]:
        return "bracket"
    # Single alphanumeric character counts as a symbol.
    if re.fullmatch(r"[A-Za-z0-9]", token):
        return "symbol"
    # BUGFIX: anchor the identifier match (was re.match without "$"), so a
    # token that merely *starts* with identifier chars is not misclassified.
    if re.fullmatch(r"[A-Za-z0-9_]+", token) and len(token) > 1:
        return "word"
    # CJK / full-width range counts as word-like text.
    if any("\u3000" <= c <= "\u9faf" for c in token):
        return "word"
    return "other"


def decompose_text(text):
    """Tokenize *text* and return its structural decomposition record.

    Returns a dict with the raw text, the token list, per-token shape
    annotations, the shape sequence ("structure_signature") used for
    structural de-duplication, and a list of heuristic notes.
    """
    # Multi-character operators first; within each family the longest
    # alternative precedes its prefixes (e.g. "<=>" before "<=").
    # BUGFIX: "<=" and "<=>" (emitted by ARROWS_2 and recognized by
    # classify_shape) and "||" (in BRACKETS_2) were missing from the pattern,
    # so they were split into single characters and misread as brackets.
    pattern = r"(-->|<--|<=>|<->|<=|<>|->|=>|\|\||\[\]|\[a\]|<a>|Box|Diamond|[A-Za-z0-9_]+|[\u3000-\u9faf]+|[^\s])"
    tokens = [t for t in re.findall(pattern, text) if t.strip()]
    shapes = []
    for i, t in enumerate(tokens):
        shapes.append({
            "token": t,
            "shape": classify_shape(t),
            "position": i
        })
    # Heuristic notes about the overall structure.
    notes = []
    shape_types = [s["shape"] for s in shapes]
    if "arrow" in shape_types:
        notes.append("arrow_detected")
    if "modal" in shape_types:
        notes.append("modal_detected")
    if "\"" in text or "'" in text:
        notes.append("quoted_segment")
    has_jp = any(any("\u3000" <= c <= "\u9faf" for c in t) for t in tokens)
    has_en = any(re.search(r"[a-zA-Z]", t) for t in tokens)
    if has_jp and has_en:
        notes.append("mixed_language")
    # Bracket balance check (extended to angle brackets).
    stack = []
    unbalanced = False
    pairs = {")": "(", "]": "[", "}": "{", ">": "<"}
    for t in tokens:
        if t in pairs.values():
            stack.append(t)
        elif t in pairs:
            if not stack or stack[-1] != pairs[t]:
                unbalanced = True
                break
            stack.pop()
    # Leftover openers also count as unbalanced.
    if stack:
        unbalanced = True
    if unbalanced:
        notes.append("unbalanced_bracket")
    # Formula-like heuristic: symbol-dense, or contains arrows/modals.
    symbol_ratio = shape_types.count("symbol") / len(shape_types) if shape_types else 0
    if symbol_ratio > 0.3 or "arrow" in shape_types or "modal" in shape_types:
        notes.append("formula_like_sequence")
    return {
        "raw_text": text,
        "tokens": tokens,
        "shapes": shapes,
        "structure_signature": shape_types,
        "notes": notes
    }
def main():
    """Generate COUNT entries unique in both raw text and structure signature.

    Appends minimal seed rows to SEED_FILE and full decomposition records to
    KB_FILE, after first loading the existing corpora for de-duplication.
    """
    load_existing()
    print(f"Generating {COUNT} NEW unique entries...")
    accepted = 0
    tries = 0
    seed_rows = []
    kb_rows = []
    while accepted < COUNT:
        tries += 1
        candidate = generate_novel_text()
        # Reject anything whose exact raw text was seen before.
        if candidate in existing_texts:
            continue
        record = decompose_text(candidate)
        signature = tuple(record["structure_signature"])
        # Strict structural uniqueness: a shape sequence that already exists
        # is rejected to force diversity.
        if signature in existing_sigs:
            continue
        # Accepted: remember it so later candidates are checked against it too.
        existing_texts.add(candidate)
        existing_sigs.add(signature)
        seed_rows.append({"raw_text": candidate})
        kb_rows.append(record)
        accepted += 1
        if accepted % 500 == 0:
            print(f"Generated {accepted} items (Attempts: {tries})")
    print("Writing to files...")
    with SEED_FILE.open("a", encoding="utf-8") as out:
        out.writelines(json.dumps(row, ensure_ascii=False) + "\n" for row in seed_rows)
    with KB_FILE.open("a", encoding="utf-8") as out:
        out.writelines(json.dumps(row, ensure_ascii=False) + "\n" for row in kb_rows)
    print("Done.")


if __name__ == "__main__":
    main()