everydaytok commited on
Commit
afe086f
Β·
verified Β·
1 Parent(s): 387f0e3

Update data_gen.py

Browse files
Files changed (1) hide show
  1. data_gen.py +115 -147
data_gen.py CHANGED
@@ -1,61 +1,43 @@
1
  """
2
  data_gen.py β€” Training / test data for the elastic mesh.
3
 
4
- Each sample is a triple (A, B, C) where:
5
- A ∈ ℝ^DIM encodes constraints ("what must be true")
6
- B ∈ ℝ^DIM encodes objectives ("what we want")
7
- C ∈ ℝ^DIM is the analytic solution β€” the feasibility center the mesh must learn to produce
8
-
9
- Five problem families, each with a geometrically distinct C:
10
-
11
- 1. box_proj β€” clamp B into axis-aligned box defined by A
12
- 2. halfspace β€” project B onto hyperplane defined by A
13
- 3. sphere β€” project B onto sphere surface defined by A
14
- 4. simplex β€” project B onto probability simplex (A = uniform prior signal)
15
- 5. elastic_bal β€” per-dimension weighted balance between A-center and B
16
-
17
- These cover:
18
- - Bounded feasibility (box)
19
- - Equality constraints (halfspace)
20
- - Norm constraints (sphere)
21
- - Probability/sum=1 (simplex)
22
- - Soft trade-offs (elastic)
23
-
24
- The mesh sees ONLY (A, B) during inference; C is what it must reconstruct.
25
  """
26
 
27
  import numpy as np
28
  import json, pathlib, argparse
29
  from typing import List, Dict
30
 
31
- DIM = 32 # embedding dimension (set to 768 for LLM-scale)
32
- SAMPLES_PER_TYPE = 1000 # Γ— 5 types = 5 000 total
 
33
 
34
 
35
  # ── UTILITIES ─────────────────────────────────────────────────────────────────
36
 
37
- def normalize(v: np.ndarray) -> np.ndarray:
38
- n = np.linalg.norm(v)
39
- return v / (n + 1e-12)
40
 
41
- def pack(*arrays: np.ndarray, dim: int) -> np.ndarray:
42
- """Concatenate + trim/pad to `dim`."""
43
  v = np.concatenate(arrays)
44
- if len(v) >= dim:
45
- return v[:dim]
46
- return np.pad(v, (0, dim - len(v)))
47
 
48
 
49
- # ── PROBLEM TYPE 1: BOX PROJECTION ────────────────────────────────────────────
50
- #
51
- # Constraint A : encodes per-dimension box [lo, hi]
52
- # A[:D/2] = lo[:D/2], A[D/2:] = hi[:D/2]
53
- # Objective B : unconstrained target point in ℝ^D
54
- # Solution C : clip(B, lo, hi) β€” nearest point in box to B
55
- #
56
- # Meaning: "stay within resource/capacity bounds while aiming for B"
57
 
58
- def gen_box(n: int, dim: int, rng: np.random.Generator) -> List[Dict]:
59
  data = []
60
  for _ in range(n):
61
  center = rng.uniform(-2, 2, dim)
@@ -68,40 +50,40 @@ def gen_box(n: int, dim: int, rng: np.random.Generator) -> List[Dict]:
68
  return data
69
 
70
 
71
- # ── PROBLEM TYPE 2: HALFSPACE PROJECTION ──────────────────────────────────────
72
- #
73
- # Constraint A : encodes a hyperplane nα΅€x = b
74
- # A = normal vector, A[0] carries the offset b
75
- # Objective B : unconstrained point in ℝ^D
76
- # Solution C : projection of B onto the hyperplane
77
- # C = B βˆ’ (nα΅€B βˆ’ b) Β· n
78
- #
79
- # Meaning: "satisfy one hard equality constraint at minimum cost to B"
80
 
81
- def gen_halfspace(n: int, dim: int, rng: np.random.Generator) -> List[Dict]:
82
  data = []
83
  for _ in range(n):
84
- normal = normalize(rng.standard_normal(dim))
85
  b = float(rng.uniform(-1, 1))
86
  B = rng.uniform(-3, 3, dim)
87
  C = B - (float(np.dot(normal, B)) - b) * normal
88
- A = normal.copy()
89
- A[0] = b # offset embedded in first slot
90
  data.append({'A': A.tolist(), 'B': B.tolist(), 'C': C.tolist(), 'type': 'halfspace'})
91
  return data
92
 
93
 
94
- # ── PROBLEM TYPE 3: SPHERE SURFACE ────────────────────────────────────────────
95
- #
96
- # Constraint A : encodes a sphere (center, radius)
97
- # A = center vector, A[0] overwritten with radius r
98
- # Objective B : external point
99
- # Solution C : point on sphere surface nearest to B
100
- # C = center + r Β· (B βˆ’ center) / β€–B βˆ’ centerβ€–
101
- #
102
- # Meaning: "satisfy a norm/budget constraint, move toward B as far as allowed"
103
 
104
- def gen_sphere(n: int, dim: int, rng: np.random.Generator) -> List[Dict]:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
105
  data = []
106
  for _ in range(n):
107
  center = rng.uniform(-1.5, 1.5, dim)
@@ -110,134 +92,120 @@ def gen_sphere(n: int, dim: int, rng: np.random.Generator) -> List[Dict]:
110
  diff = B - center
111
  nd = np.linalg.norm(diff)
112
  if nd < 1e-10:
113
- diff = np.ones(dim) / np.sqrt(dim)
114
- nd = 1.0
115
  C = center + r * diff / nd
116
- A = center.copy()
117
- A[0] = r # radius in first slot
118
  data.append({'A': A.tolist(), 'B': B.tolist(), 'C': C.tolist(), 'type': 'sphere'})
119
  return data
120
 
121
 
122
- # ── PROBLEM TYPE 4: SIMPLEX PROJECTION ────────────────────────────────────────
123
- #
124
- # Constraint A : uniform-prior signal (all ones) β†’ encodes simplex constraint Ξ£xα΅’=1, xα΅’β‰₯0
125
- # Objective B : unconstrained "belief" vector
126
- # Solution C : nearest point on probability simplex to B
127
- #
128
- # Meaning: "find a valid probability distribution closest to unconstrained belief B"
129
- # Useful for softmax-like problems.
130
 
131
- def _proj_simplex(v: np.ndarray) -> np.ndarray:
132
  n = len(v)
133
  u = np.sort(v)[::-1]
134
  cs = np.cumsum(u) - 1.0
135
- rho = int(np.where(u * np.arange(1, n + 1) > cs)[0][-1])
136
  theta = cs[rho] / (rho + 1.0)
137
  return np.maximum(v - theta, 0.0)
138
 
139
- def gen_simplex(n: int, dim: int, rng: np.random.Generator) -> List[Dict]:
140
  data = []
141
  for _ in range(n):
142
- A = np.ones(dim) # simplex constraint signal
143
- B = rng.uniform(-1.0, 3.0, dim) # unconstrained belief
144
  C = _proj_simplex(B)
145
  data.append({'A': A.tolist(), 'B': B.tolist(), 'C': C.tolist(), 'type': 'simplex'})
146
  return data
147
 
148
 
149
- # ── PROBLEM TYPE 5: ELASTIC BALANCE ───────────────────────────────────────────
150
- #
151
- # Constraint A : encodes soft constraint center + per-dimension tightness weight w ∈ [0,1]
152
- # A[:D/2] = constraint centers, A[D/2:] = tightness weights
153
- # Objective B : desired goal point
154
- # Solution C : per-dimension elastic balance
155
- # C[j] = w[j] Β· a_center[j] + (1 βˆ’ w[j]) Β· B[j]
156
- #
157
- # Meaning: "each dimension is pulled between constraint center and objective,
158
- # with w[j] controlling how hard the constraint is in that dimension"
159
- # This is the natural problem for the elastic mesh.
160
-
161
- def gen_elastic(n: int, dim: int, rng: np.random.Generator) -> List[Dict]:
162
- data = []
163
- for _ in range(n):
164
- a_center = rng.uniform(-2, 2, dim)
165
- w = rng.uniform(0.05, 0.95, dim) # per-dim tightness
166
- B = rng.uniform(-3, 3, dim)
167
- C = w * a_center + (1.0 - w) * B
168
- A = pack(a_center[:dim//2], w[:dim//2], dim=dim)
169
- data.append({'A': A.tolist(), 'B': B.tolist(), 'C': C.tolist(), 'type': 'elastic'})
170
- return data
171
-
172
-
173
  # ── ASSEMBLY ──────────────────────────────────────────────────────────────────
174
 
175
- GENERATORS = {
176
  'box_proj': gen_box,
177
  'halfspace': gen_halfspace,
178
- 'sphere': gen_sphere,
179
- 'simplex': gen_simplex,
180
  'elastic': gen_elastic,
181
  }
 
 
 
 
 
182
 
183
- def generate_all(n_per_type: int = SAMPLES_PER_TYPE,
184
- dim: int = DIM,
185
- seed: int = 42) -> List[Dict]:
186
  rng = np.random.default_rng(seed)
187
  data = []
188
- for fn in GENERATORS.values():
189
  data.extend(fn(n_per_type, dim, rng))
190
  idx = rng.permutation(len(data))
191
  return [data[i] for i in idx]
192
 
193
 
194
- # ── MAIN ──────────────────────────────────────────────────────────────────────
195
-
196
  if __name__ == '__main__':
197
- parser = argparse.ArgumentParser(description='Generate elastic mesh training data')
198
- parser.add_argument('--dim', type=int, default=DIM, help='embedding dimension')
199
- parser.add_argument('--n', type=int, default=SAMPLES_PER_TYPE, help='samples per problem type')
200
- parser.add_argument('--out', type=str, default='data', help='output directory')
201
  args = parser.parse_args()
202
 
203
- print(f"\n{'─'*50}")
204
  print(f" Generating {5 * args.n} samples | dim={args.dim}")
205
- print(f"{'─'*50}")
206
-
207
- data = generate_all(args.n, args.dim)
208
- split = int(len(data) * 0.9)
209
- train, test = data[:split], data[split:]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
210
 
211
  out = pathlib.Path(args.out)
212
  out.mkdir(exist_ok=True)
213
  with open(out / 'train.json', 'w') as f: json.dump(train, f)
214
  with open(out / 'test.json', 'w') as f: json.dump(test, f)
215
 
216
- # Per-type statistics
217
  from collections import Counter
218
- train_types = Counter(d['type'] for d in train)
219
- test_types = Counter(d['type'] for d in test)
220
-
221
- print(f"\n Train : {len(train)}")
222
- print(f" Test : {len(test)}\n")
223
- print(f" {'Type':<14} {'Train':>8} {'Test':>7} C-norm (mean)")
224
- print(f" {'─'*14} {'─'*8} {'─'*7} {'─'*14}")
225
- for t in GENERATORS:
226
- subset = [d for d in data if d['type'] == t]
227
- norms = [np.linalg.norm(d['C']) for d in subset]
228
- print(f" {t:<14} {train_types[t]:>8} {test_types[t]:>7} "
229
- f"{np.mean(norms):.3f} Β± {np.std(norms):.3f}")
230
-
231
- # Sanity check one sample per type
232
- print(f"\n Sanity check (first sample per type):")
233
- seen = set()
234
- for d in data:
235
- if d['type'] in seen: continue
236
- seen.add(d['type'])
237
  A, B, C = map(np.array, [d['A'], d['B'], d['C']])
238
- err = np.linalg.norm(A - B)
239
- print(f" [{d['type']:<12}] "
240
- f"β€–Aβ€–={np.linalg.norm(A):.2f} β€–Bβ€–={np.linalg.norm(B):.2f} "
241
- f"β€–Cβ€–={np.linalg.norm(C):.2f} β€–A-Bβ€–={err:.2f}")
242
 
243
  print(f"\n Saved β†’ {out}/train.json {out}/test.json\n")
 
1
  """
2
  data_gen.py β€” Training / test data for the elastic mesh.
3
 
4
+ OOD TEST DESIGN
5
+ ───────────────
6
+ SEEN during training : box_proj | halfspace | elastic
7
+ UNSEEN (OOD) at test : sphere | simplex
8
+
9
+ This lets us distinguish:
10
+ β€’ Memorisation β†’ high acc on seen, low acc on unseen
11
+ β€’ Geometry β†’ high acc on both (the real claim)
12
+
13
+ Each sample: (A, B, C) where A=constraints, B=objectives, C=feasibility center.
14
+ DIM = 64 (doubled from the previous run; a stress test before LLM scale).
 
 
 
 
 
 
 
 
 
 
15
  """
16
 
17
  import numpy as np
18
  import json, pathlib, argparse
19
  from typing import List, Dict
20
 
21
DIM = 64                 # embedding dimension (doubled from 32; set to 768 for LLM-scale)
SAMPLES_PER_TYPE = 1000  # × 5 types = 5 000 total
24
 
25
 
26
  # ── UTILITIES ─────────────────────────────────────────────────────────────────
27
 
28
def norm(v: np.ndarray) -> np.ndarray:
    """Return *v* scaled to unit length.

    The small epsilon keeps the division finite for the zero vector
    (which then maps to the zero vector, not NaN).
    """
    magnitude = np.linalg.norm(v)
    return v / (magnitude + 1e-12)
 
30
 
31
def pack(*arrays: np.ndarray, dim: int) -> np.ndarray:
    """Concatenate *arrays*, then trim or zero-pad the result to length ``dim``."""
    flat = np.concatenate(arrays)
    if len(flat) >= dim:
        return flat[:dim]
    return np.pad(flat, (0, dim - len(flat)))
 
 
34
 
35
 
36
+ # ── PROBLEM TYPE 1 (SEEN): BOX PROJECTION ────────────────────────────────────
37
+ # C = clip(B, lo, hi)
38
+ # A encodes the box bounds
 
 
 
 
 
39
 
40
+ def gen_box(n, dim, rng):
41
  data = []
42
  for _ in range(n):
43
  center = rng.uniform(-2, 2, dim)
 
50
  return data
51
 
52
 
53
+ # ── PROBLEM TYPE 2 (SEEN): HALFSPACE PROJECTION ───────────────────────────────
54
+ # C = B βˆ’ (nα΅€B βˆ’ b)Β·n (project B onto hyperplane nα΅€x = b)
 
 
 
 
 
 
 
55
 
56
def gen_halfspace(n, dim, rng):
    """Generate halfspace-projection samples (SEEN family).

    A encodes a hyperplane nᵀx = b (unit normal, offset b stored in slot 0);
    B is an unconstrained point; C is B projected onto the hyperplane:
    C = B − (nᵀB − b)·n.
    """
    samples = []
    for _ in range(n):
        unit = norm(rng.standard_normal(dim))
        offset = float(rng.uniform(-1, 1))
        B = rng.uniform(-3, 3, dim)
        C = B - (float(np.dot(unit, B)) - offset) * unit
        A = unit.copy()
        # NOTE: writing the offset into A[0] discards the normal's first
        # component — deliberate encoding choice carried over from the design.
        A[0] = offset
        samples.append({'A': A.tolist(), 'B': B.tolist(), 'C': C.tolist(), 'type': 'halfspace'})
    return samples
66
 
67
 
68
+ # ── PROBLEM TYPE 3 (SEEN): ELASTIC BALANCE ────────────────────────────────────
69
+ # C[j] = w[j]Β·a_center[j] + (1βˆ’w[j])Β·B[j] per-dimension soft trade-off
 
 
 
 
 
 
 
70
 
71
def gen_elastic(n, dim, rng):
    """Generate elastic-balance samples (SEEN family).

    Per dimension j: C[j] = w[j]·center[j] + (1−w[j])·B[j], a soft trade-off
    between the constraint center and the objective B, with tightness w∈[0.05,0.95].
    A packs the first halves of the centers and weights into one dim-vector.
    """
    samples = []
    for _ in range(n):
        centers = rng.uniform(-2, 2, dim)
        weights = rng.uniform(0.05, 0.95, dim)  # per-dimension tightness
        B = rng.uniform(-3, 3, dim)
        C = weights * centers + (1.0 - weights) * B
        A = pack(centers[:dim//2], weights[:dim//2], dim=dim)
        samples.append({'A': A.tolist(), 'B': B.tolist(), 'C': C.tolist(), 'type': 'elastic'})
    return samples
81
+
82
+
83
+ # ── PROBLEM TYPE 4 (OOD): SPHERE SURFACE ─────────────────────────────────────
84
+ # C = center + rΒ·(Bβˆ’center)/β€–Bβˆ’centerβ€– (nearest point on sphere to B)
85
+
86
+ def gen_sphere(n, dim, rng):
87
  data = []
88
  for _ in range(n):
89
  center = rng.uniform(-1.5, 1.5, dim)
 
92
  diff = B - center
93
  nd = np.linalg.norm(diff)
94
  if nd < 1e-10:
95
+ diff = np.ones(dim) / np.sqrt(dim); nd = 1.0
 
96
  C = center + r * diff / nd
97
+ A = center.copy(); A[0] = r
 
98
  data.append({'A': A.tolist(), 'B': B.tolist(), 'C': C.tolist(), 'type': 'sphere'})
99
  return data
100
 
101
 
102
+ # ── PROBLEM TYPE 5 (OOD): SIMPLEX PROJECTION ─────────────────────────────────
103
+ # C = nearest point on probability simplex to B (Ξ£xα΅’=1, xα΅’β‰₯0)
 
 
 
 
 
 
104
 
105
+ def _proj_simplex(v):
106
  n = len(v)
107
  u = np.sort(v)[::-1]
108
  cs = np.cumsum(u) - 1.0
109
+ rho = int(np.where(u * np.arange(1, n+1) > cs)[0][-1])
110
  theta = cs[rho] / (rho + 1.0)
111
  return np.maximum(v - theta, 0.0)
112
 
113
def gen_simplex(n, dim, rng):
    """Generate simplex-projection samples (OOD family).

    A is the all-ones uniform-prior signal marking the constraint Σxᵢ=1, xᵢ≥0;
    B is an unconstrained belief vector; C is its projection onto the simplex.
    """
    samples = []
    for _ in range(n):
        A = np.ones(dim)                 # constant signal encodes the simplex constraint
        B = rng.uniform(-1.0, 3.0, dim)  # unconstrained belief
        C = _proj_simplex(B)
        samples.append({'A': A.tolist(), 'B': B.tolist(), 'C': C.tolist(), 'type': 'simplex'})
    return samples
121
 
122
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
123
# ── ASSEMBLY ──────────────────────────────────────────────────────────────────

# Families the model trains on (SEEN) vs. families withheld entirely until
# test time (OOD). This split lets the experiment distinguish memorisation
# (high accuracy on SEEN only) from learned geometry (high accuracy on both).
SEEN_TYPES = {
    'box_proj': gen_box,
    'halfspace': gen_halfspace,
    'elastic': gen_elastic,
}
OOD_TYPES = {
    'sphere': gen_sphere,
    'simplex': gen_simplex,
}
# Merged registry; dict-merge preserves insertion order (SEEN first, then OOD).
ALL_TYPES = {**SEEN_TYPES, **OOD_TYPES}
135
 
136
+
137
def generate_all(n_per_type=SAMPLES_PER_TYPE, dim=DIM, seed=42):
    """Return a shuffled dataset with n_per_type samples from every family.

    Uses a single seeded Generator for both sampling and the final
    permutation, so output is fully reproducible for a given seed.
    """
    rng = np.random.default_rng(seed)
    pool = []
    for generator in ALL_TYPES.values():
        pool.extend(generator(n_per_type, dim, rng))
    order = rng.permutation(len(pool))
    return [pool[i] for i in order]
144
 
145
 
 
 
146
if __name__ == '__main__':
    # CLI: dimension, samples per family, and output directory.
    parser = argparse.ArgumentParser()
    parser.add_argument('--dim', type=int, default=DIM)
    parser.add_argument('--n', type=int, default=SAMPLES_PER_TYPE)
    parser.add_argument('--out', type=str, default='data')
    args = parser.parse_args()

    print(f"\n{'─'*55}")
    print(f" Generating {5 * args.n} samples | dim={args.dim}")
    print(f" SEEN : box_proj | halfspace | elastic")
    print(f" OOD : sphere | simplex")
    print(f"{'─'*55}")

    # Fixed seed for reproducibility. NOTE(review): this ignores any seed the
    # library-level generate_all() would take — the CLI path always uses 42.
    rng = np.random.default_rng(42)

    # Generate SEEN and OOD pools separately so OOD can be excluded from train.
    seen_data, ood_data = [], []
    for t, fn in SEEN_TYPES.items():
        seen_data.extend(fn(args.n, args.dim, rng))
    for t, fn in OOD_TYPES.items():
        ood_data.extend(fn(args.n, args.dim, rng))

    # Shuffle within splits
    si = rng.permutation(len(seen_data))
    oi = rng.permutation(len(ood_data))
    seen_data = [seen_data[i] for i in si]
    ood_data = [ood_data[i] for i in oi]

    # Train = 90% of SEEN only
    # Test = 10% of SEEN + ALL OOD (so model never trained on OOD)
    split = int(len(seen_data) * 0.9)
    train = seen_data[:split]
    test_seen = seen_data[split:]
    test = test_seen + ood_data

    # Re-shuffle test so seen/OOD are interleaved
    ti = rng.permutation(len(test))
    test = [test[i] for i in ti]

    # Persist both splits as plain JSON lists of {'A','B','C','type'} dicts.
    out = pathlib.Path(args.out)
    out.mkdir(exist_ok=True)
    with open(out / 'train.json', 'w') as f: json.dump(train, f)
    with open(out / 'test.json', 'w') as f: json.dump(test, f)

    # Per-type counts for the summary table below.
    from collections import Counter
    tr_types = Counter(d['type'] for d in train)
    te_types = Counter(d['type'] for d in test)

    print(f"\n {'Type':<14} {'Train':>7} {'Test':>7} {'Split'}")
    print(f" {'─'*14} {'─'*7} {'─'*7} {'─'*10}")
    for t in ALL_TYPES:
        label = 'OOD ✗' if t in OOD_TYPES else 'SEEN ✓'
        print(f" {t:<14} {tr_types.get(t,0):>7} {te_types.get(t,0):>7} {label}")
    print(f"\n Total train={len(train)} test={len(test)}\n")

    # Quick sanity: verify C is geometrically correct for first sample per type
    # (only norms are printed — this eyeballs magnitudes rather than checking
    # the projection identities exactly).
    print(f" Sanity check:")
    seen_set = set()
    for d in train + test:
        t = d['type']
        if t in seen_set: continue
        seen_set.add(t)
        A, B, C = map(np.array, [d['A'], d['B'], d['C']])
        print(f" [{t:<12}] ‖A‖={np.linalg.norm(A):.2f} "
              f"‖B‖={np.linalg.norm(B):.2f} ‖C‖={np.linalg.norm(C):.2f}")

    print(f"\n Saved → {out}/train.json {out}/test.json\n")