everydaytok commited on
Commit
afe086f
Β·
verified Β·
1 Parent(s): 387f0e3

Update data_gen.py

Browse files
Files changed (1) hide show
  1. data_gen.py +115 -147
data_gen.py CHANGED
@@ -1,61 +1,43 @@
1
  """
2
  data_gen.py β€” Training / test data for the elastic mesh.
3
 
4
- Each sample is a triple (A, B, C) where:
5
- A ∈ ℝ^DIM encodes constraints ("what must be true")
6
- B ∈ ℝ^DIM encodes objectives ("what we want")
7
- C ∈ ℝ^DIM is the analytic solution β€” the feasibility center the mesh must learn to produce
8
-
9
- Five problem families, each with a geometrically distinct C:
10
-
11
- 1. box_proj β€” clamp B into axis-aligned box defined by A
12
- 2. halfspace β€” project B onto hyperplane defined by A
13
- 3. sphere β€” project B onto sphere surface defined by A
14
- 4. simplex β€” project B onto probability simplex (A = uniform prior signal)
15
- 5. elastic_bal β€” per-dimension weighted balance between A-center and B
16
-
17
- These cover:
18
- - Bounded feasibility (box)
19
- - Equality constraints (halfspace)
20
- - Norm constraints (sphere)
21
- - Probability/sum=1 (simplex)
22
- - Soft trade-offs (elastic)
23
-
24
- The mesh sees ONLY (A, B) during inference; C is what it must reconstruct.
25
  """
26
 
27
  import numpy as np
28
  import json, pathlib, argparse
29
  from typing import List, Dict
30
 
31
- DIM = 32 # embedding dimension (set to 768 for LLM-scale)
32
- SAMPLES_PER_TYPE = 1000 # Γ— 5 types = 5 000 total
 
33
 
34
 
35
  # ── UTILITIES ─────────────────────────────────────────────────────────────────
36
 
37
- def normalize(v: np.ndarray) -> np.ndarray:
38
- n = np.linalg.norm(v)
39
- return v / (n + 1e-12)
40
 
41
- def pack(*arrays: np.ndarray, dim: int) -> np.ndarray:
42
- """Concatenate + trim/pad to `dim`."""
43
  v = np.concatenate(arrays)
44
- if len(v) >= dim:
45
- return v[:dim]
46
- return np.pad(v, (0, dim - len(v)))
47
 
48
 
49
- # ── PROBLEM TYPE 1: BOX PROJECTION ────────────────────────────────────────────
50
- #
51
- # Constraint A : encodes per-dimension box [lo, hi]
52
- # A[:D/2] = lo[:D/2], A[D/2:] = hi[:D/2]
53
- # Objective B : unconstrained target point in ℝ^D
54
- # Solution C : clip(B, lo, hi) β€” nearest point in box to B
55
- #
56
- # Meaning: "stay within resource/capacity bounds while aiming for B"
57
 
58
- def gen_box(n: int, dim: int, rng: np.random.Generator) -> List[Dict]:
59
  data = []
60
  for _ in range(n):
61
  center = rng.uniform(-2, 2, dim)
@@ -68,40 +50,40 @@ def gen_box(n: int, dim: int, rng: np.random.Generator) -> List[Dict]:
68
  return data
69
 
70
 
71
- # ── PROBLEM TYPE 2: HALFSPACE PROJECTION ──────────────────────────────────────
72
- #
73
- # Constraint A : encodes a hyperplane nα΅€x = b
74
- # A = normal vector, A[0] carries the offset b
75
- # Objective B : unconstrained point in ℝ^D
76
- # Solution C : projection of B onto the hyperplane
77
- # C = B βˆ’ (nα΅€B βˆ’ b) Β· n
78
- #
79
- # Meaning: "satisfy one hard equality constraint at minimum cost to B"
80
 
81
- def gen_halfspace(n: int, dim: int, rng: np.random.Generator) -> List[Dict]:
82
  data = []
83
  for _ in range(n):
84
- normal = normalize(rng.standard_normal(dim))
85
  b = float(rng.uniform(-1, 1))
86
  B = rng.uniform(-3, 3, dim)
87
  C = B - (float(np.dot(normal, B)) - b) * normal
88
- A = normal.copy()
89
- A[0] = b # offset embedded in first slot
90
  data.append({'A': A.tolist(), 'B': B.tolist(), 'C': C.tolist(), 'type': 'halfspace'})
91
  return data
92
 
93
 
94
- # ── PROBLEM TYPE 3: SPHERE SURFACE ────────────────────────────────────────────
95
- #
96
- # Constraint A : encodes a sphere (center, radius)
97
- # A = center vector, A[0] overwritten with radius r
98
- # Objective B : external point
99
- # Solution C : point on sphere surface nearest to B
100
- # C = center + r Β· (B βˆ’ center) / β€–B βˆ’ centerβ€–
101
- #
102
- # Meaning: "satisfy a norm/budget constraint, move toward B as far as allowed"
103
 
104
- def gen_sphere(n: int, dim: int, rng: np.random.Generator) -> List[Dict]:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
105
  data = []
106
  for _ in range(n):
107
  center = rng.uniform(-1.5, 1.5, dim)
@@ -110,134 +92,120 @@ def gen_sphere(n: int, dim: int, rng: np.random.Generator) -> List[Dict]:
110
  diff = B - center
111
  nd = np.linalg.norm(diff)
112
  if nd < 1e-10:
113
- diff = np.ones(dim) / np.sqrt(dim)
114
- nd = 1.0
115
  C = center + r * diff / nd
116
- A = center.copy()
117
- A[0] = r # radius in first slot
118
  data.append({'A': A.tolist(), 'B': B.tolist(), 'C': C.tolist(), 'type': 'sphere'})
119
  return data
120
 
121
 
122
- # ── PROBLEM TYPE 4: SIMPLEX PROJECTION ────────────────────────────────────────
123
- #
124
- # Constraint A : uniform-prior signal (all ones) β†’ encodes simplex constraint Ξ£xα΅’=1, xα΅’β‰₯0
125
- # Objective B : unconstrained "belief" vector
126
- # Solution C : nearest point on probability simplex to B
127
- #
128
- # Meaning: "find a valid probability distribution closest to unconstrained belief B"
129
- # Useful for softmax-like problems.
130
 
131
- def _proj_simplex(v: np.ndarray) -> np.ndarray:
132
  n = len(v)
133
  u = np.sort(v)[::-1]
134
  cs = np.cumsum(u) - 1.0
135
- rho = int(np.where(u * np.arange(1, n + 1) > cs)[0][-1])
136
  theta = cs[rho] / (rho + 1.0)
137
  return np.maximum(v - theta, 0.0)
138
 
139
- def gen_simplex(n: int, dim: int, rng: np.random.Generator) -> List[Dict]:
140
  data = []
141
  for _ in range(n):
142
- A = np.ones(dim) # simplex constraint signal
143
- B = rng.uniform(-1.0, 3.0, dim) # unconstrained belief
144
  C = _proj_simplex(B)
145
  data.append({'A': A.tolist(), 'B': B.tolist(), 'C': C.tolist(), 'type': 'simplex'})
146
  return data
147
 
148
 
149
- # ── PROBLEM TYPE 5: ELASTIC BALANCE ───────────────────────────────────────────
150
- #
151
- # Constraint A : encodes soft constraint center + per-dimension tightness weight w ∈ [0,1]
152
- # A[:D/2] = constraint centers, A[D/2:] = tightness weights
153
- # Objective B : desired goal point
154
- # Solution C : per-dimension elastic balance
155
- # C[j] = w[j] Β· a_center[j] + (1 βˆ’ w[j]) Β· B[j]
156
- #
157
- # Meaning: "each dimension is pulled between constraint center and objective,
158
- # with w[j] controlling how hard the constraint is in that dimension"
159
- # This is the natural problem for the elastic mesh.
160
-
161
- def gen_elastic(n: int, dim: int, rng: np.random.Generator) -> List[Dict]:
162
- data = []
163
- for _ in range(n):
164
- a_center = rng.uniform(-2, 2, dim)
165
- w = rng.uniform(0.05, 0.95, dim) # per-dim tightness
166
- B = rng.uniform(-3, 3, dim)
167
- C = w * a_center + (1.0 - w) * B
168
- A = pack(a_center[:dim//2], w[:dim//2], dim=dim)
169
- data.append({'A': A.tolist(), 'B': B.tolist(), 'C': C.tolist(), 'type': 'elastic'})
170
- return data
171
-
172
-
173
  # ── ASSEMBLY ──────────────────────────────────────────────────────────────────
174
 
175
- GENERATORS = {
176
  'box_proj': gen_box,
177
  'halfspace': gen_halfspace,
178
- 'sphere': gen_sphere,
179
- 'simplex': gen_simplex,
180
  'elastic': gen_elastic,
181
  }
 
 
 
 
 
182
 
183
- def generate_all(n_per_type: int = SAMPLES_PER_TYPE,
184
- dim: int = DIM,
185
- seed: int = 42) -> List[Dict]:
186
  rng = np.random.default_rng(seed)
187
  data = []
188
- for fn in GENERATORS.values():
189
  data.extend(fn(n_per_type, dim, rng))
190
  idx = rng.permutation(len(data))
191
  return [data[i] for i in idx]
192
 
193
 
194
- # ── MAIN ──────────────────────────────────────────────────────────────────────
195
-
196
  if __name__ == '__main__':
197
- parser = argparse.ArgumentParser(description='Generate elastic mesh training data')
198
- parser.add_argument('--dim', type=int, default=DIM, help='embedding dimension')
199
- parser.add_argument('--n', type=int, default=SAMPLES_PER_TYPE, help='samples per problem type')
200
- parser.add_argument('--out', type=str, default='data', help='output directory')
201
  args = parser.parse_args()
202
 
203
- print(f"\n{'─'*50}")
204
  print(f" Generating {5 * args.n} samples | dim={args.dim}")
205
- print(f"{'─'*50}")
206
-
207
- data = generate_all(args.n, args.dim)
208
- split = int(len(data) * 0.9)
209
- train, test = data[:split], data[split:]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
210
 
211
  out = pathlib.Path(args.out)
212
  out.mkdir(exist_ok=True)
213
  with open(out / 'train.json', 'w') as f: json.dump(train, f)
214
  with open(out / 'test.json', 'w') as f: json.dump(test, f)
215
 
216
- # Per-type statistics
217
  from collections import Counter
218
- train_types = Counter(d['type'] for d in train)
219
- test_types = Counter(d['type'] for d in test)
220
-
221
- print(f"\n Train : {len(train)}")
222
- print(f" Test : {len(test)}\n")
223
- print(f" {'Type':<14} {'Train':>8} {'Test':>7} C-norm (mean)")
224
- print(f" {'─'*14} {'─'*8} {'─'*7} {'─'*14}")
225
- for t in GENERATORS:
226
- subset = [d for d in data if d['type'] == t]
227
- norms = [np.linalg.norm(d['C']) for d in subset]
228
- print(f" {t:<14} {train_types[t]:>8} {test_types[t]:>7} "
229
- f"{np.mean(norms):.3f} Β± {np.std(norms):.3f}")
230
-
231
- # Sanity check one sample per type
232
- print(f"\n Sanity check (first sample per type):")
233
- seen = set()
234
- for d in data:
235
- if d['type'] in seen: continue
236
- seen.add(d['type'])
237
  A, B, C = map(np.array, [d['A'], d['B'], d['C']])
238
- err = np.linalg.norm(A - B)
239
- print(f" [{d['type']:<12}] "
240
- f"β€–Aβ€–={np.linalg.norm(A):.2f} β€–Bβ€–={np.linalg.norm(B):.2f} "
241
- f"β€–Cβ€–={np.linalg.norm(C):.2f} β€–A-Bβ€–={err:.2f}")
242
 
243
  print(f"\n Saved β†’ {out}/train.json {out}/test.json\n")
 
1
  """
2
  data_gen.py β€” Training / test data for the elastic mesh.
3
 
4
+ OOD TEST DESIGN
5
+ ───────────────
6
+ SEEN during training : box_proj | halfspace | elastic
7
+ UNSEEN (OOD) at test : sphere | simplex
8
+
9
+ This lets us distinguish:
10
+ β€’ Memorisation β†’ high acc on seen, low acc on unseen
11
+ β€’ Geometry β†’ high acc on both (the real claim)
12
+
13
+ Each sample: (A, B, C) where A=constraints, B=objectives, C=feasibility center.
14
+ DIM = 64 (doubled from the previous run; a stress test before LLM scale).
 
 
 
 
 
 
 
 
 
 
15
  """
16
 
17
  import numpy as np
18
  import json, pathlib, argparse
19
  from typing import List, Dict
20
 
21
DIM = 64                 # embedding dimension (doubled from 32; set to 768 for LLM-scale)
SAMPLES_PER_TYPE = 1000  # × 5 types = 5 000 total
24
 
25
 
26
  # ── UTILITIES ─────────────────────────────────────────────────────────────────
27
 
28
def norm(v: np.ndarray) -> np.ndarray:
    """Return *v* scaled to unit length.

    The small epsilon keeps the division finite for the zero vector
    (which then maps to the zero vector, not NaN).
    """
    magnitude = np.linalg.norm(v)
    return v / (magnitude + 1e-12)
 
30
 
31
def pack(*arrays: np.ndarray, dim: int) -> np.ndarray:
    """Concatenate *arrays*, then trim or zero-pad the result to length ``dim``."""
    flat = np.concatenate(arrays)
    if len(flat) >= dim:
        return flat[:dim]
    return np.pad(flat, (0, dim - len(flat)))
 
 
34
 
35
 
36
+ # ── PROBLEM TYPE 1 (SEEN): BOX PROJECTION ────────────────────────────────────
37
+ # C = clip(B, lo, hi)
38
+ # A encodes the box bounds
 
 
 
 
 
39
 
40
+ def gen_box(n, dim, rng):
41
  data = []
42
  for _ in range(n):
43
  center = rng.uniform(-2, 2, dim)
 
50
  return data
51
 
52
 
53
+ # ── PROBLEM TYPE 2 (SEEN): HALFSPACE PROJECTION ───────────────────────────────
54
+ # C = B βˆ’ (nα΅€B βˆ’ b)Β·n (project B onto hyperplane nα΅€x = b)
 
 
 
 
 
 
 
55
 
56
def gen_halfspace(n, dim, rng):
    """Generate halfspace-projection samples (SEEN family).

    A encodes a hyperplane nᵀx = b (unit normal, offset b stored in slot 0);
    B is an unconstrained point; C is B projected onto the hyperplane:
    C = B − (nᵀB − b)·n.
    """
    samples = []
    for _ in range(n):
        unit = norm(rng.standard_normal(dim))
        offset = float(rng.uniform(-1, 1))
        B = rng.uniform(-3, 3, dim)
        C = B - (float(np.dot(unit, B)) - offset) * unit
        A = unit.copy()
        # NOTE: writing the offset into A[0] discards the normal's first
        # component — deliberate encoding choice carried over from the design.
        A[0] = offset
        samples.append({'A': A.tolist(), 'B': B.tolist(), 'C': C.tolist(), 'type': 'halfspace'})
    return samples
66
 
67
 
68
+ # ── PROBLEM TYPE 3 (SEEN): ELASTIC BALANCE ────────────────────────────────────
69
+ # C[j] = w[j]Β·a_center[j] + (1βˆ’w[j])Β·B[j] per-dimension soft trade-off
 
 
 
 
 
 
 
70
 
71
def gen_elastic(n, dim, rng):
    """Generate elastic-balance samples (SEEN family).

    Per dimension j: C[j] = w[j]·center[j] + (1−w[j])·B[j], a soft trade-off
    between the constraint center and the objective B, with tightness w∈[0.05,0.95].
    A packs the first halves of the centers and weights into one dim-vector.
    """
    samples = []
    for _ in range(n):
        centers = rng.uniform(-2, 2, dim)
        weights = rng.uniform(0.05, 0.95, dim)  # per-dimension tightness
        B = rng.uniform(-3, 3, dim)
        C = weights * centers + (1.0 - weights) * B
        A = pack(centers[:dim//2], weights[:dim//2], dim=dim)
        samples.append({'A': A.tolist(), 'B': B.tolist(), 'C': C.tolist(), 'type': 'elastic'})
    return samples
81
+
82
+
83
+ # ── PROBLEM TYPE 4 (OOD): SPHERE SURFACE ─────────────────────────────────────
84
+ # C = center + rΒ·(Bβˆ’center)/β€–Bβˆ’centerβ€– (nearest point on sphere to B)
85
+
86
+ def gen_sphere(n, dim, rng):
87
  data = []
88
  for _ in range(n):
89
  center = rng.uniform(-1.5, 1.5, dim)
 
92
  diff = B - center
93
  nd = np.linalg.norm(diff)
94
  if nd < 1e-10:
95
+ diff = np.ones(dim) / np.sqrt(dim); nd = 1.0
 
96
  C = center + r * diff / nd
97
+ A = center.copy(); A[0] = r
 
98
  data.append({'A': A.tolist(), 'B': B.tolist(), 'C': C.tolist(), 'type': 'sphere'})
99
  return data
100
 
101
 
102
+ # ── PROBLEM TYPE 5 (OOD): SIMPLEX PROJECTION ─────────────────────────────────
103
+ # C = nearest point on probability simplex to B (Ξ£xα΅’=1, xα΅’β‰₯0)
 
 
 
 
 
 
104
 
105
+ def _proj_simplex(v):
106
  n = len(v)
107
  u = np.sort(v)[::-1]
108
  cs = np.cumsum(u) - 1.0
109
+ rho = int(np.where(u * np.arange(1, n+1) > cs)[0][-1])
110
  theta = cs[rho] / (rho + 1.0)
111
  return np.maximum(v - theta, 0.0)
112
 
113
def gen_simplex(n, dim, rng):
    """Generate simplex-projection samples (OOD family).

    A is the all-ones uniform-prior signal marking the constraint Σxᵢ=1, xᵢ≥0;
    B is an unconstrained belief vector; C is its projection onto the simplex.
    """
    samples = []
    for _ in range(n):
        A = np.ones(dim)                 # constant signal encodes the simplex constraint
        B = rng.uniform(-1.0, 3.0, dim)  # unconstrained belief
        C = _proj_simplex(B)
        samples.append({'A': A.tolist(), 'B': B.tolist(), 'C': C.tolist(), 'type': 'simplex'})
    return samples
121
 
122
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
123
# ── ASSEMBLY ──────────────────────────────────────────────────────────────────

# Families the model trains on (SEEN) vs. families withheld entirely until
# test time (OOD). This split lets the experiment distinguish memorisation
# (high accuracy on SEEN only) from learned geometry (high accuracy on both).
SEEN_TYPES = {
    'box_proj': gen_box,
    'halfspace': gen_halfspace,
    'elastic': gen_elastic,
}
OOD_TYPES = {
    'sphere': gen_sphere,
    'simplex': gen_simplex,
}
# Merged registry; dict-merge preserves insertion order (SEEN first, then OOD).
ALL_TYPES = {**SEEN_TYPES, **OOD_TYPES}
135
 
136
+
137
def generate_all(n_per_type=SAMPLES_PER_TYPE, dim=DIM, seed=42):
    """Return a shuffled dataset with n_per_type samples from every family.

    Uses a single seeded Generator for both sampling and the final
    permutation, so output is fully reproducible for a given seed.
    """
    rng = np.random.default_rng(seed)
    pool = []
    for generator in ALL_TYPES.values():
        pool.extend(generator(n_per_type, dim, rng))
    order = rng.permutation(len(pool))
    return [pool[i] for i in order]
144
 
145
 
 
 
146
if __name__ == '__main__':
    # CLI: dimension, samples per family, and output directory.
    parser = argparse.ArgumentParser()
    parser.add_argument('--dim', type=int, default=DIM)
    parser.add_argument('--n', type=int, default=SAMPLES_PER_TYPE)
    parser.add_argument('--out', type=str, default='data')
    args = parser.parse_args()

    print(f"\n{'─'*55}")
    print(f" Generating {5 * args.n} samples | dim={args.dim}")
    print(f" SEEN : box_proj | halfspace | elastic")
    print(f" OOD : sphere | simplex")
    print(f"{'─'*55}")

    # Fixed seed for reproducibility. NOTE(review): this ignores any seed the
    # library-level generate_all() would take — the CLI path always uses 42.
    rng = np.random.default_rng(42)

    # Generate SEEN and OOD pools separately so OOD can be excluded from train.
    seen_data, ood_data = [], []
    for t, fn in SEEN_TYPES.items():
        seen_data.extend(fn(args.n, args.dim, rng))
    for t, fn in OOD_TYPES.items():
        ood_data.extend(fn(args.n, args.dim, rng))

    # Shuffle within splits
    si = rng.permutation(len(seen_data))
    oi = rng.permutation(len(ood_data))
    seen_data = [seen_data[i] for i in si]
    ood_data = [ood_data[i] for i in oi]

    # Train = 90% of SEEN only
    # Test = 10% of SEEN + ALL OOD (so model never trained on OOD)
    split = int(len(seen_data) * 0.9)
    train = seen_data[:split]
    test_seen = seen_data[split:]
    test = test_seen + ood_data

    # Re-shuffle test so seen/OOD are interleaved
    ti = rng.permutation(len(test))
    test = [test[i] for i in ti]

    # Persist both splits as plain JSON lists of {'A','B','C','type'} dicts.
    out = pathlib.Path(args.out)
    out.mkdir(exist_ok=True)
    with open(out / 'train.json', 'w') as f: json.dump(train, f)
    with open(out / 'test.json', 'w') as f: json.dump(test, f)

    # Per-type counts for the summary table below.
    from collections import Counter
    tr_types = Counter(d['type'] for d in train)
    te_types = Counter(d['type'] for d in test)

    print(f"\n {'Type':<14} {'Train':>7} {'Test':>7} {'Split'}")
    print(f" {'─'*14} {'─'*7} {'─'*7} {'─'*10}")
    for t in ALL_TYPES:
        label = 'OOD ✗' if t in OOD_TYPES else 'SEEN ✓'
        print(f" {t:<14} {tr_types.get(t,0):>7} {te_types.get(t,0):>7} {label}")
    print(f"\n Total train={len(train)} test={len(test)}\n")

    # Quick sanity: verify C is geometrically correct for first sample per type
    # (only norms are printed — this eyeballs magnitudes rather than checking
    # the projection identities exactly).
    print(f" Sanity check:")
    seen_set = set()
    for d in train + test:
        t = d['type']
        if t in seen_set: continue
        seen_set.add(t)
        A, B, C = map(np.array, [d['A'], d['B'], d['C']])
        print(f" [{t:<12}] ‖A‖={np.linalg.norm(A):.2f} "
              f"‖B‖={np.linalg.norm(B):.2f} ‖C‖={np.linalg.norm(C):.2f}")

    print(f"\n Saved → {out}/train.json {out}/test.json\n")