File size: 6,983 Bytes
29b87da
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
import json
import random
import re
from pathlib import Path

# Config
COUNT = 2000  # number of NEW unique entries to generate per run
SEED_FILE = Path("avh_math/db/text_cross_seed.jsonl")  # raw generated texts, one JSON object per line
KB_FILE = Path("avh_math/db/text_cross_kb.jsonl")      # decomposed entries, one JSON object per line

# Ensure directories
# NOTE(review): both files share the same parent directory, so the second
# mkdir is a harmless no-op; kept in case the paths ever diverge.
SEED_FILE.parent.mkdir(parents=True, exist_ok=True)
KB_FILE.parent.mkdir(parents=True, exist_ok=True)

# --- 1. Load Existing Data (for Uniqueness Check) ---
# Module-level uniqueness registries: populated from disk by load_existing()
# and extended in-memory by main() as new items are accepted.
existing_texts = set()  # set of raw_text strings already generated
existing_sigs = set()   # set of structure-signature tuples already generated

def load_existing():
    """Populate the module-level uniqueness registries from disk.

    Reads previously generated entries so a new run never duplicates a raw
    text (from SEED_FILE) or a structure signature (from KB_FILE). Missing
    files are ignored; malformed lines are skipped individually.
    """
    if SEED_FILE.exists():
        with SEED_FILE.open("r", encoding="utf-8") as f:
            for line in f:
                # BUGFIX: the original bare `except: pass` also swallowed
                # KeyboardInterrupt/SystemExit. Catch only what a corrupt
                # line can raise: bad JSON, or valid JSON that is not a
                # dict (AttributeError on .get).
                try:
                    obj = json.loads(line)
                    existing_texts.add(obj.get("raw_text", ""))
                except (json.JSONDecodeError, AttributeError):
                    continue

    if KB_FILE.exists():
        with KB_FILE.open("r", encoding="utf-8") as f:
            for line in f:
                try:
                    obj = json.loads(line)
                    # Signatures are stored as lists in JSONL; tuples are
                    # hashable so they can live in a set.
                    sig = tuple(obj.get("structure_signature", []))
                    existing_sigs.add(sig)
                except (json.JSONDecodeError, AttributeError):
                    continue

    print(f"Loaded {len(existing_texts)} existing texts and {len(existing_sigs)} signatures.")

# --- 2. Advanced Generator (Focus on Novelty) ---

# Expanded Vocabulary
JP_PARTS_2 = ["かつ", "または", "すべての", "存在する", "写像", "空間", "演算", "同型", "正規", "可換", "一意", "満たす", "要素"]
EN_PARTS_2 = ["forall", "exists", "map", "space", "op", "iso", "normal", "commute", "unique", "s.t.", "element", "implies", "iff"]
SYMBOLS_2 = ["X", "Y", "Z", "r1", "r2", "theta", "phi", "sum", "prod", "lim", "int", "del", "nabla"]
ARROWS_2 = ["=>", "<=", "<=>", "-->", "<--"]
MODALS_2 = ["Box", "Diamond", "[a]", "<a>", "K", "T", "S4"]
BRACKETS_2 = ["{", "}", "<", ">", "|", "||"]
OTHERS_2 = ["#", "$", "%", "&", "@", "^", "_", "`", "~", ";;", "::"]

def generate_novel_text():
    # Strategy: Vary length drastically, mix types aggressively
    length = random.choice([2, 3, 4, 15, 20, 25]) # Very short or very long
    
    parts = []
    
    # Mode selection for structural diversity
    mode = random.choice(["dense_symbol", "verbose_text", "bracket_mess", "arrow_chain", "mixed_chaos"])
    
    for _ in range(length):
        if mode == "dense_symbol":
            parts.append(random.choice(SYMBOLS_2 + ARROWS_2 + MODALS_2 + OTHERS_2))
        elif mode == "verbose_text":
            parts.append(random.choice(JP_PARTS_2 + EN_PARTS_2))
        elif mode == "bracket_mess":
            parts.append(random.choice(BRACKETS_2 + SYMBOLS_2))
        elif mode == "arrow_chain":
            parts.append(random.choice(ARROWS_2 + SYMBOLS_2))
        else: # mixed_chaos
            parts.append(random.choice(JP_PARTS_2 + EN_PARTS_2 + SYMBOLS_2 + ARROWS_2 + MODALS_2 + BRACKETS_2 + OTHERS_2))
            
    # Assemble
    text = ""
    for p in parts:
        if random.random() < 0.5: # 50% chance of no space to create weird tokens
            text += p
        else:
            text += " " + p
            
    return text.strip()

# --- 3. Decomposition Logic (Consistent with previous) ---

def classify_shape(token):
    """Classify a single token's "shape" for structure signatures.

    Categories, checked in priority order:
      arrow   -- implication/rewrite arrows ("->", "<=>", ...)
      modal   -- modal-logic operators ("Box", "[a]", ...)
      bracket -- grouping/delimiter tokens
      symbol  -- a single ASCII alphanumeric character
      word    -- a multi-char ASCII identifier, or any token containing a
                 character in U+3000..U+9FAF (CJK / Japanese range)
      other   -- everything else ("#", "$", punctuation runs, ...)
    """
    if token in {"->", "→", "=>", "<-", "<=", "<=>", "-->", "<--"}:
        return "arrow"
    if token in {"[]", "□", "<>", "Diamond", "Box", "[a]", "<a>"}:
        return "modal"
    if token in {"(", ")", "[", "]", "{", "}", "\"", "'", "<", ">", "|", "||"}:
        return "bracket"

    # Symbol vs Word vs Other
    if re.fullmatch(r"[A-Za-z0-9]", token):
        return "symbol"
    # BUGFIX: the word test was anchored only at the start (re.match), so a
    # token like "abc!" counted as a word. fullmatch requires the WHOLE
    # token to be an identifier.
    if len(token) > 1 and re.fullmatch(r"[A-Za-z0-9_]+", token):
        return "word"
    if any("\u3000" <= c <= "\u9faf" for c in token):
        return "word"

    return "other"

def decompose_text(text):
    """Decompose *text* into tokens, per-token shapes, and analysis notes.

    Returns a dict with: raw_text, tokens, shapes (token/shape/position),
    structure_signature (the ordered list of shape labels), and notes
    (heuristic flags such as arrow_detected or unbalanced_bracket).
    """
    # Tokenizer: multi-char operators first, longest-first within each
    # family, then identifiers, CJK runs, and finally any single
    # non-space character.
    # BUGFIX: "<=>" and "<=" were missing, even though ARROWS_2 generates
    # them and classify_shape treats them as arrows — "<=>" used to
    # splinter into "<" + "=>". Alternation is now ordered longest-first
    # so shorter arrows cannot shadow longer ones.
    pattern = r"(<=>|-->|<--|<->|->|=>|<=|\[\]|<>|Box|Diamond|\[a\]|<a>|[A-Za-z0-9_]+|[\u3000-\u9faf]+|[^\s])"
    tokens = [t for t in re.findall(pattern, text) if t.strip()]

    shapes = []
    for i, t in enumerate(tokens):
        shapes.append({
            "token": t,
            "shape": classify_shape(t),
            "position": i
        })

    # Heuristic notes about the overall structure.
    notes = []
    shape_types = [s["shape"] for s in shapes]

    if "arrow" in shape_types:
        notes.append("arrow_detected")
    if "modal" in shape_types:
        notes.append("modal_detected")
    if "\"" in text or "'" in text:
        notes.append("quoted_segment")

    has_jp = any(any("\u3000" <= c <= "\u9faf" for c in t) for t in tokens)
    has_en = any(re.search(r"[a-zA-Z]", t) for t in tokens)
    if has_jp and has_en:
        notes.append("mixed_language")

    # Bracket balance check (extended to angle brackets).
    stack = []
    unbalanced = False
    pairs = {")": "(", "]": "[", "}": "{", ">": "<"}
    for t in tokens:
        if t in pairs.values():
            stack.append(t)
        elif t in pairs:
            if not stack or stack[-1] != pairs[t]:
                unbalanced = True
                break
            stack.pop()
    if stack:
        unbalanced = True
    if unbalanced:
        notes.append("unbalanced_bracket")

    # Formula-like check: many single-letter symbols, or any arrow/modal.
    symbol_ratio = shape_types.count("symbol") / len(shape_types) if shape_types else 0
    if symbol_ratio > 0.3 or "arrow" in shape_types or "modal" in shape_types:
        notes.append("formula_like_sequence")

    return {
        "raw_text": text,
        "tokens": tokens,
        "shapes": shapes,
        "structure_signature": shape_types,
        "notes": notes
    }

def main():
    """Generate COUNT entries unique in both raw text and structure.

    Appends {"raw_text": ...} records to SEED_FILE and full decompositions
    to KB_FILE. Uniqueness is enforced against everything already on disk
    (via load_existing) and against items accepted earlier in this run.
    """
    load_existing()
    print(f"Generating {COUNT} NEW unique entries...")

    generated_count = 0
    attempts = 0
    stale_attempts = 0   # attempts since the last accepted item
    MAX_STALE = 100_000  # give up instead of spinning forever

    buffer_seed = []
    buffer_kb = []

    while generated_count < COUNT:
        attempts += 1
        stale_attempts += 1
        if stale_attempts > MAX_STALE:
            # BUGFIX: the original loop could hang indefinitely once every
            # reachable structure signature was already taken; bail out and
            # write whatever was collected so far.
            print(f"Stopping early after {attempts} attempts: "
                  f"only {generated_count} unique items found.")
            break

        text = generate_novel_text()

        # Raw-text uniqueness check
        if text in existing_texts:
            continue

        # Decompose
        kb_data = decompose_text(text)
        sig = tuple(kb_data["structure_signature"])

        # Structure uniqueness check (strict): skipping any already-seen
        # signature forces structural diversity.
        if sig in existing_sigs:
            continue

        # Accepted: register in the in-memory sets and buffer for writing.
        existing_texts.add(text)
        existing_sigs.add(sig)

        buffer_seed.append({"raw_text": text})
        buffer_kb.append(kb_data)

        generated_count += 1
        stale_attempts = 0

        if generated_count % 500 == 0:
            print(f"Generated {generated_count} items (Attempts: {attempts})")

    print("Writing to files...")

    with SEED_FILE.open("a", encoding="utf-8") as fs:
        for item in buffer_seed:
            fs.write(json.dumps(item, ensure_ascii=False) + "\n")

    with KB_FILE.open("a", encoding="utf-8") as fk:
        for item in buffer_kb:
            fk.write(json.dumps(item, ensure_ascii=False) + "\n")

    print("Done.")

# Script entry point: run the generator only when executed directly,
# not when imported as a module.
if __name__ == "__main__":
    main()