| """ |
| NeuroLex v4 — Quick Test & Validation Script |
| ============================================= |
| Run this to verify everything works before training. |
| Works on CPU (no GPU needed for testing). |
| |
| Usage: |
| python test_model.py |
| """ |
|
|
| import torch |
| import sys |
| import time |
|
|
def test_imports():
    """Check that both project modules expose every name this suite relies on.

    Returns:
        bool: True when all imports succeed; False when anything raised while
        importing (the error text is printed instead of propagating).
    """
    print("Testing imports...", end=" ")
    imports_ok = False
    try:
        # The imported names are deliberately unused: binding them IS the check.
        # The model module is imported before the dataset module.
        from neurolex_v4_model import (
            NeuroLexV4, NeuroLexConfig, CharTokenizer, create_model,
            AdaptiveLayerNorm, DiTBlock, TimeEmbedding,
            DOMAINS, STYLES, LANGUAGES, DOMAIN_TO_ID, STYLE_TO_ID, LANG_TO_ID
        )
        from neurolex_v4_dataset import (
            create_dataloaders, NeuroLexDataset, StreamingNeuroLexDataset,
            LANGUAGE_WORDS, DOMAIN_NAMES, generate_blended_name,
            generate_augmented_dataset, PREFIXES, SUFFIXES
        )
        print("✅ All imports successful")
        imports_ok = True
    except Exception as exc:
        # Broad on purpose: any failure while importing counts as a failed check.
        print(f"❌ Import error: {exc}")
    return imports_ok
|
|
|
|
def test_tokenizer():
    """Round-trip sample names through CharTokenizer, one by one and batched.

    Returns:
        bool: True when every assertion holds; a failed check raises
        AssertionError instead of returning.
    """
    print("Testing tokenizer...", end=" ")
    from neurolex_v4_model import CharTokenizer

    tokenizer = CharTokenizer()
    samples = ['Nexora', 'FluxByte', 'luminara', 'Kaze-Flow', 'X42']

    # Single-name path: decode(encode(name)) must return the name unchanged.
    for original in samples:
        ids = tokenizer.encode(original, max_len=24)
        restored = tokenizer.decode(ids)
        assert restored == original, f"Roundtrip failed: '{original}' → {ids} → '{restored}'"

    # Batched path: one row per name, max_len columns.
    encoded_batch = tokenizer.batch_encode(samples, max_len=24)
    assert encoded_batch.shape == (5, 24), f"Batch shape wrong: {encoded_batch.shape}"

    restored_batch = tokenizer.batch_decode(encoded_batch)
    for expected, actual in zip(samples, restored_batch):
        assert expected == actual, f"Batch roundtrip failed: '{expected}' → '{actual}'"

    print(f"✅ Tokenizer works (vocab_size={tokenizer.vocab_size})")
    return True
|
|
|
|
def test_model_forward():
    """Run a small NeuroLexV4 forward pass and check output shape and CFG masking.

    Builds a reduced-size model, feeds it a batch of four encoded names with
    random timestep and conditioning ids, and checks that:

    * the output has shape (batch, sequence length, config.vocab_size);
    * a second pass with ``cfg_mask`` set to all True has the same shape;
    * the two passes produce different logits.

    The model is put in eval mode and run without autograd so that the
    difference check is attributable to ``cfg_mask`` rather than to any
    train-only randomness.

    Returns:
        bool: True when every assertion holds; a failed check raises
        AssertionError instead of returning.
    """
    print("Testing model forward pass...", end=" ")
    from neurolex_v4_model import NeuroLexV4, NeuroLexConfig, CharTokenizer

    config = NeuroLexConfig(
        d_model=64, n_heads=4, n_layers=2, d_ff=128
    )
    model = NeuroLexV4(config)
    # If the model contains dropout (or any other train-only stochastic layer),
    # two training-mode passes differ even when cfg_mask is ignored, which would
    # let the CFG assertion below pass for the wrong reason.
    model.eval()
    tok = CharTokenizer()

    B, L = 4, 24
    x = tok.batch_encode(['Nexora', 'FluxByte', 'Kaze', 'Luminara'], max_len=L)
    t = torch.rand(B)
    domain_id = torch.randint(0, config.n_domains, (B,))
    style_id = torch.randint(0, config.n_styles, (B,))
    lang_id = torch.randint(0, config.n_languages, (B,))
    length_id = torch.randint(0, config.n_lengths, (B,))

    # Shape/value checks only: no gradients are needed here.
    with torch.no_grad():
        # Conditional pass.
        logits = model(x, t, domain_id, style_id, lang_id, length_id)
        assert logits.shape == (B, L, config.vocab_size), f"Output shape wrong: {logits.shape}"

        # Pass with every sample masked (named "uncond" here; presumably this
        # drops the conditioning — confirm against the model implementation).
        cfg_mask = torch.ones(B, dtype=torch.bool)
        logits_uncond = model(x, t, domain_id, style_id, lang_id, length_id, cfg_mask=cfg_mask)
        assert logits_uncond.shape == (B, L, config.vocab_size)

        # Same inputs, only cfg_mask differs, so any difference comes from the mask.
        diff = (logits - logits_uncond).abs().sum()
        assert diff > 0, "CFG mask should produce different logits"

    print(f"✅ Forward pass works (output: {logits.shape})")
    return True
|
|
|
|
def test_loss_computation():
    """Test UDLM loss computation and gradient flow on a small model.

    Computes ``model.compute_loss`` on a batch of eight encoded names with
    random conditioning ids, checks that the loss is a finite positive scalar,
    back-propagates it, and checks that every parameter received a gradient.

    Returns:
        bool: True when every assertion holds; a failed check raises
        AssertionError instead of returning.
    """
    print("Testing loss computation...", end=" ")
    from neurolex_v4_model import NeuroLexV4, NeuroLexConfig, CharTokenizer

    config = NeuroLexConfig(d_model=64, n_heads=4, n_layers=2, d_ff=128)
    model = NeuroLexV4(config)
    tok = CharTokenizer()

    B = 8
    x = tok.batch_encode(['Test' + str(i) for i in range(B)], max_len=24)
    domain_id = torch.randint(0, config.n_domains, (B,))
    style_id = torch.randint(0, config.n_styles, (B,))
    lang_id = torch.randint(0, config.n_languages, (B,))
    length_id = torch.randint(0, config.n_lengths, (B,))

    loss = model.compute_loss(x, domain_id, style_id, lang_id, length_id)

    assert loss.dim() == 0, f"Loss should be scalar, got shape {loss.shape}"
    # Finiteness first: NaN compares False against 0, so testing positivity
    # first would report "should be positive" and never reach the NaN message.
    assert not torch.isnan(loss), "Loss is NaN!"
    assert not torch.isinf(loss), "Loss is Inf!"
    assert loss.item() > 0, f"Loss should be positive, got {loss.item()}"

    loss.backward()

    # Collect parameter names so a failure says which ones were left out of
    # the backward pass, not just how many.
    missing = [name for name, p in model.named_parameters() if p.grad is None]
    total_params = sum(1 for _ in model.parameters())
    has_grad = total_params - len(missing)
    assert not missing, (
        f"Only {has_grad}/{total_params} params have gradients "
        f"(first missing: {missing[:5]})"
    )

    print(f"✅ Loss works (loss={loss.item():.4f}, grads OK)")
    return True
|
|
|
|
def test_generation():
    """Generate a small batch of names from an untrained model and sanity-check them.

    Checks that at least one name comes back, that every name is a ``str``,
    and that every name has at least three characters.

    Returns:
        bool: True when every assertion holds; a failed check raises
        AssertionError instead of returning.
    """
    print("Testing generation...", end=" ")
    from neurolex_v4_model import NeuroLexV4, NeuroLexConfig, DOMAIN_TO_ID, STYLE_TO_ID, LANG_TO_ID

    small_config = NeuroLexConfig(d_model=64, n_heads=4, n_layers=2, d_ff=128)
    model = NeuroLexV4(small_config)
    model.eval()

    # Sampling settings gathered in one place; few steps keep the run cheap on CPU.
    sampling = dict(
        domain_id=DOMAIN_TO_ID['tech'],
        style_id=STYLE_TO_ID['sharp'],
        lang_id=LANG_TO_ID['english'],
        target_length=7,
        batch_size=8,
        cfg_scale=2.0,
        temperature=0.9,
        n_steps=10,
        odd_alpha=4.0,
        device='cpu',
    )
    names = model.generate(**sampling)

    assert len(names) > 0, "No names generated!"
    assert all(isinstance(n, str) for n in names), "Names should be strings"
    assert all(len(n) >= 3 for n in names), f"Names too short: {names}"

    # Case-insensitive distinct count, reported for information only.
    distinct = {n.lower() for n in names}

    print(f"✅ Generation works ({len(names)} names, {len(distinct)} unique)")
    print(f" Sample: {names[:5]}")
    return True
|
|
|
|
def test_dataset():
    """Exercise the dataset module: raw generation, both Dataset classes, vocab tables.

    Returns:
        bool: True when every assertion holds; a failed check raises
        AssertionError instead of returning.
    """
    print("Testing dataset...", end=" ")
    from neurolex_v4_dataset import (
        NeuroLexDataset, StreamingNeuroLexDataset,
        generate_augmented_dataset, LANGUAGE_WORDS, DOMAIN_NAMES
    )

    # Raw generation: at least the requested number of samples must come back.
    data = generate_augmented_dataset(n_samples=1000, seed=42)
    assert len(data) >= 1000, f"Dataset too small: {len(data)}"

    # Every record must carry the fields checked below.
    first_record = data[0]
    for field in ('name', 'domain', 'style', 'language', 'length'):
        assert field in first_record, f"Missing '{field}' field"

    # Map-style dataset: items expose a length-24 id vector and a scalar domain.
    map_ds = NeuroLexDataset(n_samples=500, seed=42)
    map_item = map_ds[0]
    assert map_item['input_ids'].shape == (24,), f"Wrong shape: {map_item['input_ids'].shape}"
    assert map_item['domain'].dim() == 0, "Domain should be scalar"

    # Streaming dataset: same id-vector shape.
    streaming_ds = StreamingNeuroLexDataset(epoch_size=100)
    streaming_item = streaming_ds[0]
    assert streaming_item['input_ids'].shape == (24,)

    # Vocabulary tables: language coverage is enforced, sizes are just reported.
    assert len(LANGUAGE_WORDS) >= 25, f"Only {len(LANGUAGE_WORDS)} languages"
    total_words = sum(len(words) for words in LANGUAGE_WORDS.values())
    total_brands = sum(len(brands) for brands in DOMAIN_NAMES.values())

    print(f"✅ Dataset works ({len(data)} samples, {total_words} lang words, {total_brands} brands)")
    return True
|
|
|
|
def test_dataloader():
    """Pull one batch from the training DataLoader and check its tensor shapes.

    Returns:
        bool: True when every assertion holds; a failed check raises
        AssertionError instead of returning.
    """
    print("Testing dataloader...", end=" ")
    from neurolex_v4_dataset import create_dataloaders

    # num_workers=0 keeps loading in the main process.
    loaders = create_dataloaders(batch_size=32, n_samples=500, num_workers=0)
    train_loader, val_loader = loaders

    first_batch = next(iter(train_loader))
    ids = first_batch['input_ids']
    assert ids.shape == (32, 24), f"Wrong batch shape: {ids.shape}"

    # Each conditioning field is expected to hold one value per sample.
    for key in ('domain', 'style', 'language', 'length'):
        assert first_batch[key].shape == (32,)

    print(f"✅ Dataloader works (train={len(train_loader)} batches, val={len(val_loader)} batches)")
    return True
|
|
|
|
def test_training_step():
    """Run one complete optimizer step on a real batch and sanity-check it.

    Computes the loss on the first training batch, back-propagates, checks
    that no populated gradient contains NaN or Inf, applies one AdamW step,
    then re-evaluates the loss on the same batch and checks it is still
    finite. The two loss values are reported, not compared.

    Returns:
        bool: True when every assertion holds; a failed check raises
        AssertionError instead of returning.
    """
    print("Testing training step...", end=" ")
    from neurolex_v4_model import NeuroLexV4, NeuroLexConfig
    from neurolex_v4_dataset import create_dataloaders
    from torch.optim import AdamW

    config = NeuroLexConfig(d_model=64, n_heads=4, n_layers=2, d_ff=128)
    model = NeuroLexV4(config)
    optimizer = AdamW(model.parameters(), lr=1e-3)

    train_loader, _ = create_dataloaders(batch_size=16, n_samples=100, num_workers=0)
    batch = next(iter(train_loader))
    # Positional arguments for compute_loss, reused for the post-step evaluation.
    loss_inputs = (
        batch['input_ids'], batch['domain'], batch['style'],
        batch['language'], batch['length'],
    )

    model.train()
    loss = model.compute_loss(*loss_inputs)

    optimizer.zero_grad()
    loss.backward()

    # Parameters whose grad is None are skipped; only populated gradients are checked.
    for name, p in model.named_parameters():
        if p.grad is not None:
            assert not torch.isnan(p.grad).any(), f"NaN gradient in {name}"
            assert not torch.isinf(p.grad).any(), f"Inf gradient in {name}"

    optimizer.step()

    # Post-step health check. No backward pass follows, so skip building an
    # autograd graph. Without the finiteness assertion, a step that blew up the
    # weights would still be reported as a success.
    with torch.no_grad():
        loss2 = model.compute_loss(*loss_inputs)
    assert torch.isfinite(loss2).all(), \
        f"Loss became non-finite after optimizer step: {loss2.item()}"

    print(f"✅ Training step works (loss: {loss.item():.4f} → {loss2.item():.4f})")
    return True
|
|
|
|
def test_model_sizes():
    """Build each named model preset and check its parameter count is in range.

    Returns:
        bool: True when every preset falls inside its expected bounds; a
        preset outside its bounds raises AssertionError.
    """
    print("Testing model sizes...", end=" ")
    from neurolex_v4_model import create_model

    # Preset name -> (inclusive lower bound, inclusive upper bound) on parameters.
    expected_ranges = {
        'tiny': (1_000_000, 5_000_000),
        'small': (3_000_000, 8_000_000),
        'base': (8_000_000, 15_000_000),
        'large': (15_000_000, 35_000_000),
    }

    for preset, bounds in expected_ranges.items():
        lower, upper = bounds
        # create_model returns a (model, config) pair; only the model is needed.
        model, _preset_config = create_model(preset)
        param_count = model.count_parameters()
        assert lower <= param_count <= upper, \
            f"{preset}: {param_count} params not in [{lower}, {upper}]"

    print("✅ All model sizes valid")
    return True
|
|
|
|
def run_all_tests():
    """Execute every validation test in order and print a pass/fail summary.

    A test counts as passed when it returns a truthy value, and as failed when
    it returns a falsy value or raises. Raised exceptions are printed together
    with a traceback and do not stop the remaining tests.

    Returns:
        bool: True when no test failed.
    """
    import traceback

    rule = "=" * 60
    print(rule)
    print(" NEUROLEX v4 — VALIDATION TESTS")
    print(rule)
    print()

    # Execution order of the suite.
    suite = (
        test_imports,
        test_tokenizer,
        test_model_forward,
        test_loss_computation,
        test_dataset,
        test_dataloader,
        test_training_step,
        test_generation,
        test_model_sizes,
    )

    n_passed = 0
    n_failed = 0

    for check in suite:
        try:
            outcome = bool(check())
        except Exception as exc:
            # Each test prints its own "Testing ..." prefix without a newline,
            # so this message lands on that same line.
            print(f"❌ {check.__name__} FAILED: {exc}")
            traceback.print_exc()
            n_failed += 1
            continue

        if outcome:
            n_passed += 1
        else:
            n_failed += 1

    all_ok = n_failed == 0

    print()
    print(rule)
    if all_ok:
        print(f" ✅ ALL {n_passed} TESTS PASSED — Ready to train!")
    else:
        print(f" ⚠️ {n_passed} passed, {n_failed} FAILED")
    print(rule)

    return all_ok
|
|
|
|
if __name__ == '__main__':
    # Script entry point: the process exit status mirrors the suite outcome
    # (0 when every test passed, 1 otherwise).
    success = run_all_tests()
    sys.exit(int(not success))
|
|