| """
|
| H4 Polytopic Attention — CPU autoresearch training script.
|
| This is the ONLY file the agent modifies during autonomous research.
|
|
|
| Follows the autoresearch pattern: modify → run (2 min budget) → measure → keep/discard.
|
|
|
| The frozen H4 geometry is off-limits. Only the trainable adapters, hyperparameters,
|
| training loop details, and architecture of trainable layers may be changed.
|
| """
|
|
|
| import os
|
| import math
|
| import time
|
| import json
|
| import torch
|
| import torch.nn as nn
|
| import torch.nn.functional as F
|
| import numpy as np
|
|
|
| import sys
|
| sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
|
|
|
| from h4_polytopic_attention import generate_600_cell_vertices, build_coxeter_chambers
|
| from h4_language_model import H4LanguageModel
|
| from bitlinear import BitLinear
|
|
|
|
|
|
|
|
|
|
|
# Golden ratio; fundamental constant of H4 / icosahedral geometry.
PHI = (1 + math.sqrt(5)) / 2

# Frozen H4 geometry (off-limits per the module contract above):
# vertices of the 600-cell and the Coxeter chamber decomposition built
# from them. Computed once at import time and treated as read-only.
# NOTE(review): exact shapes/keys come from h4_polytopic_attention —
# only 'simple_roots' is read here; confirm other keys before relying on them.
VERTICES = torch.tensor(generate_600_cell_vertices(), dtype=torch.float32)
CHAMBERS = build_coxeter_chambers(VERTICES.numpy())
SIMPLE_ROOTS = torch.tensor(CHAMBERS['simple_roots'], dtype=torch.float32)
|
|
|
|
|
|
|
|
|
|
|
|
|
# --- Run budget -------------------------------------------------------------
# Wall-clock training budget in seconds (the autoresearch loop's 2 minutes).
TIME_BUDGET = 120

# Dataset selector; anything other than 'synthetic' is loaded via
# prepare_data.load_and_prepare in main().
DATASET = 'synthetic'

# --- Data / batching --------------------------------------------------------
MAX_SEQ_LEN = 128
BATCH_SIZE = 8

# --- Model architecture (trainable side only; H4 geometry stays frozen) -----
D_MODEL = 256
N_HEADS = 8
N_LAYERS = 4
D_VALUE = 16
D_FFN = 512
TOP_K = 16
DROPOUT = 0.0
USE_BITLINEAR = True   # swap nn.Linear for ternary BitLinear layers

# --- Optimization -----------------------------------------------------------
LR = 5e-3
WEIGHT_DECAY = 0.01
WARMUP_STEPS = 50      # linear LR warmup before the cosine decay
GRAD_CLIP = 1.0        # max grad norm; <= 0 disables clipping

# --- Evaluation cadence -----------------------------------------------------
EVAL_INTERVAL = 25     # steps between validation passes
EVAL_BATCHES = 5       # batches averaged per validation pass
|
|
|
|
|
|
|
|
|
|
|
def load_text_data():
    """Load training text. Falls back to synthetic data if no file available."""
    here = os.path.dirname(__file__)
    candidates = (
        os.path.join(here, '..', 'data', 'shakespeare.txt'),
        os.path.join(here, '..', 'data', 'input.txt'),
        os.path.join(here, 'data', 'input.txt'),
    )

    # Take the first corpus file that exists on disk.
    for candidate in candidates:
        if os.path.exists(candidate):
            with open(candidate, 'r', encoding='utf-8') as fh:
                corpus = fh.read()
            print(f"Loaded data from {candidate} ({len(corpus)} chars)")
            return corpus

    # No corpus found: synthesize repetitive text. Phrase choice and repeat
    # count walk the Fibonacci sequence so the stream has learnable structure.
    print("No data file found, generating synthetic text...")
    base_phrases = [
        "the golden ratio appears in nature ",
        "fibonacci numbers grow exponentially ",
        "symmetry underlies all of physics ",
        "the icosahedron has twenty faces ",
        "phi equals one plus one over phi ",
        "geometry is the language of space ",
        "five fold symmetry cannot tile a plane ",
        "the dodecahedron has twelve faces ",
    ]

    pieces = []
    fib_prev, fib_curr = 1, 1
    for _ in range(200):
        phrase = base_phrases[fib_prev % len(base_phrases)]
        pieces.append(phrase * (fib_curr % 3 + 1))
        fib_prev, fib_curr = fib_curr, fib_prev + fib_curr
    return ''.join(pieces)
|
|
|
|
|
def prepare_char_dataset(text: str):
    """Build a character-level train/val split plus vocab lookup tables.

    Returns (train_data, val_data, vocab_size, stoi, itos) where the data
    tensors are long tensors of character ids and the split is 90/10.
    """
    vocabulary = sorted(set(text))
    vocab_size = len(vocabulary)

    # Forward and inverse lookup tables over the sorted character set.
    itos = dict(enumerate(vocabulary))
    stoi = {ch: idx for idx, ch in itos.items()}

    encoded = torch.tensor([stoi[ch] for ch in text], dtype=torch.long)

    # First 90% of the stream trains; the tail is held out for validation.
    split_at = int(0.9 * len(encoded))
    return encoded[:split_at], encoded[split_at:], vocab_size, stoi, itos
|
|
|
|
|
def get_batch(data: torch.Tensor, batch_size: int, seq_len: int):
    """Sample a random batch of next-token-prediction sequences.

    Args:
        data: 1-D long tensor of token ids (must have at least 2 elements).
        batch_size: number of sequences to sample (with replacement).
        seq_len: requested window length; silently clamped to len(data) - 1
            so both the input window and its shifted target always fit.

    Returns:
        (x, y): tensors of shape (batch_size, effective_seq_len) with
        y[b, t] == data[start_b + t + 1], i.e. y is x shifted by one token.
    """
    # Bug fix: previously, when len(data) <= seq_len the fallback
    # `max_start = 1` produced x of length len(data) but y of length
    # len(data) - 1 — a shape mismatch that breaks the downstream
    # cross-entropy. Clamping seq_len keeps x and y the same length.
    seq_len = min(seq_len, len(data) - 1)

    # Valid starts are 0 .. len(data) - seq_len - 1 inclusive; randint's
    # upper bound is exclusive, so len(data) - seq_len also reclaims the
    # final window that the original off-by-one excluded.
    max_start = max(len(data) - seq_len, 1)
    ix = torch.randint(0, max_start, (batch_size,))
    x = torch.stack([data[i:i + seq_len] for i in ix])
    y = torch.stack([data[i + 1:i + seq_len + 1] for i in ix])
    return x, y
|
|
|
|
|
|
|
|
|
|
|
|
|
def main():
    """Run one budgeted training + evaluation session and print metrics.

    Trains the H4 language model on CPU until TIME_BUDGET seconds of pure
    training time have elapsed, then runs a final validation pass, pulls
    model diagnostics, generates a text sample, and prints a metrics block
    for the autoresearch harness to parse.
    """
    t_start = time.time()
    # Fixed seeds so runs differ only by the code change under test.
    torch.manual_seed(42)
    np.random.seed(42)

    # --- Data -----------------------------------------------------------
    if DATASET != 'synthetic':
        from prepare_data import load_and_prepare
        train_data, val_data, vocab_size, stoi, itos = load_and_prepare(DATASET)
    else:
        text = load_text_data()
        train_data, val_data, vocab_size, stoi, itos = prepare_char_dataset(text)
    print(f"Vocab size: {vocab_size}, Train: {len(train_data)}, Val: {len(val_data)}")

    # --- Model ----------------------------------------------------------
    # max_seq_len is doubled so generation can extend past training length.
    model = H4LanguageModel(
        vocab_size=vocab_size,
        d_model=D_MODEL,
        n_heads=N_HEADS,
        n_layers=N_LAYERS,
        d_value=D_VALUE,
        d_ffn=D_FFN,
        top_k=TOP_K,
        max_seq_len=MAX_SEQ_LEN * 2,
        dropout=DROPOUT,
        use_bitlinear=USE_BITLINEAR,
    )

    param_info = model.count_params()
    print(f"Model params: {param_info['trainable']:,} trainable, {param_info['buffers']:,} buffer elements")

    # --- Optimizer & LR schedule ---------------------------------------
    optimizer = torch.optim.AdamW(
        model.parameters(),
        lr=LR,
        weight_decay=WEIGHT_DECAY,
        betas=(0.9, 0.95),
    )

    def lr_schedule(step):
        # Linear warmup, then cosine decay from 1.0 down to a 0.1 floor,
        # with the decay horizon hard-coded to 500 steps.
        if step < WARMUP_STEPS:
            return step / max(WARMUP_STEPS, 1)
        progress = (step - WARMUP_STEPS) / max(1, 500 - WARMUP_STEPS)
        return 0.1 + 0.9 * 0.5 * (1 + math.cos(math.pi * min(progress, 1.0)))

    scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_schedule)

    # --- Training-loop state -------------------------------------------
    step = 0
    total_training_time = 0.0
    best_val_loss = float('inf')
    train_losses = []
    val_losses = []

    # Tree attention only kicks in for long contexts; with the default
    # MAX_SEQ_LEN=128 this is False. (Semantics of use_tree live in the
    # model — presumably a hierarchical attention path; confirm there.)
    use_tree = MAX_SEQ_LEN > 256

    print(f"\nTraining for {TIME_BUDGET}s budget, seq_len={MAX_SEQ_LEN}, use_tree={use_tree}")
    print(f"{'step':>6} {'loss':>8} {'val_loss':>8} {'lr':>10} {'dt':>6} {'progress':>8}")
    print("-" * 56)

    model.train()

    # --- Training loop: run until the wall-clock budget is spent --------
    while True:
        t0 = time.time()

        x, y = get_batch(train_data, BATCH_SIZE, MAX_SEQ_LEN)

        logits = model(x, use_tree=use_tree)
        loss = F.cross_entropy(logits.view(-1, vocab_size), y.view(-1))

        optimizer.zero_grad()
        loss.backward()

        if GRAD_CLIP > 0:
            torch.nn.utils.clip_grad_norm_(model.parameters(), GRAD_CLIP)

        optimizer.step()
        scheduler.step()

        dt = time.time() - t0
        # Skip the first few steps so one-time warmup cost (lazy inits,
        # allocator churn) doesn't eat into the measured budget.
        if step > 2:
            total_training_time += dt

        train_losses.append(loss.item())

        # Periodic validation; eval batches never use the tree path.
        val_loss = None
        if step % EVAL_INTERVAL == 0:
            model.eval()
            with torch.no_grad():
                vl = []
                for _ in range(EVAL_BATCHES):
                    xv, yv = get_batch(val_data, BATCH_SIZE, MAX_SEQ_LEN)
                    vlogits = model(xv, use_tree=False)
                    vl.append(F.cross_entropy(vlogits.view(-1, vocab_size), yv.view(-1)).item())
                val_loss = sum(vl) / len(vl)
                val_losses.append(val_loss)

            if val_loss < best_val_loss:
                best_val_loss = val_loss

            current_lr = scheduler.get_last_lr()[0]
            progress = min(total_training_time / TIME_BUDGET, 1.0)
            print(f"{step:6d} {loss.item():8.4f} {val_loss:8.4f} {current_lr:10.6f} {dt:6.3f} {progress:7.1%}")
            model.train()

        step += 1
        # step > 2 guard matches the timing warmup above.
        if step > 2 and total_training_time >= TIME_BUDGET:
            break

    # --- Final evaluation (larger sample than the periodic evals) -------
    model.eval()
    with torch.no_grad():
        vl = []
        for _ in range(EVAL_BATCHES * 4):
            xv, yv = get_batch(val_data, BATCH_SIZE, MAX_SEQ_LEN)
            vlogits = model(xv, use_tree=False)
            vl.append(F.cross_entropy(vlogits.view(-1, vocab_size), yv.view(-1)).item())
        final_val_loss = sum(vl) / len(vl)

    # Nats -> bits per byte (character-level, so bits/char == bits/byte
    # for ASCII-ish corpora — TODO confirm for non-ASCII data files).
    val_bpb = final_val_loss / math.log(2)

    # --- Geometry diagnostics from a single probe batch -----------------
    xd, _ = get_batch(val_data, 1, MAX_SEQ_LEN)
    _, diag_list = model(xd, use_tree=False, return_diagnostics=True)

    # diag_list is per-layer dicts; 'nudge_rank' and 'geo_alignment' are
    # list-valued (extended, not appended). inf ranks are dropped before
    # averaging, with [0] as a fallback when everything is inf.
    avg_chamber_entropy = np.mean([d['chamber_entropy'] for d in diag_list])
    nudge_ranks = []
    geo_aligns = []
    for d in diag_list:
        nudge_ranks.extend(d['nudge_rank'])
        geo_aligns.extend(d['geo_alignment'])
    avg_nudge_rank = np.mean([r for r in nudge_ranks if r != float('inf')] or [0])
    avg_geo_alignment = np.mean(geo_aligns)

    # --- Sample generation seeded with the first 4 vocab characters -----
    seed_text = list(stoi.keys())[:4]
    seed_ids = torch.tensor([[stoi[c] for c in seed_text]], dtype=torch.long)
    generated = model.generate(seed_ids, max_new_tokens=80, temperature=0.8, top_k_sample=10)
    gen_text = ''.join([itos.get(i.item(), '?') for i in generated[0]])

    # --- Ternary (BitLinear) diagnostics, only when the model has any ----
    has_bitlinear = any(isinstance(m, BitLinear) for m in model.modules())
    ternary_info = {}
    if has_bitlinear:
        from ternary_diagnostics import chamber_preservation, bitlinear_layer_stats, size_comparison
        cp = chamber_preservation(model)
        mean_cp = sum(cp.values()) / len(cp) if cp else 0.0
        bl_stats = bitlinear_layer_stats(model)
        mean_zero_pct = np.mean([s['zero'] for s in bl_stats.values()]) if bl_stats else 0.0
        sz = size_comparison(model)
        ternary_info = {
            'chamber_preserve': mean_cp,
            'mean_zero_pct': mean_zero_pct,
            'compression': sz['compression'],
            'mixed_kb': sz['mixed_kb'],
        }

    print("\n" + "=" * 60)
    print("GENERATED SAMPLE:")
    print(gen_text[:200])
    print("=" * 60)

    # --- Metrics block (key: value lines parsed by the harness) ---------
    print("\n---")
    print(f"val_bpb: {val_bpb:.6f}")
    print(f"val_loss: {final_val_loss:.6f}")
    print(f"best_val_loss: {best_val_loss:.6f}")
    print(f"chamber_entropy: {avg_chamber_entropy:.4f}")
    print(f"avg_nudge_rank: {avg_nudge_rank:.4f}")
    print(f"avg_geo_alignment: {avg_geo_alignment:.4f}")
    print(f"training_seconds: {total_training_time:.1f}")
    print(f"total_seconds: {time.time() - t_start:.1f}")
    # Peak memory is a hard-coded 0 placeholder (not measured on CPU).
    print(f"peak_memory_mb: {0:.1f}")
    print(f"num_steps: {step}")
    print(f"num_params: {param_info['trainable']}")
    print(f"vocab_size: {vocab_size}")
    print(f"seq_len: {MAX_SEQ_LEN}")
    print(f"ternary: {'yes' if USE_BITLINEAR else 'no'}")
    if ternary_info:
        print(f"chamber_preserve: {ternary_info['chamber_preserve']:.4f}")
        print(f"mean_zero_pct: {ternary_info['mean_zero_pct']:.4f}")
        print(f"compression: {ternary_info['compression']:.1f}x")
        print(f"model_size_kb: {ternary_info['mixed_kb']:.1f}")
|
|
|
# Script entry point: run a single budgeted training + evaluation session.
if __name__ == '__main__':
    main()
|
|
|