""" test_model.py ============= Validation test suite for the MiniLM project. Run this script before publishing to HuggingFace to confirm that all components are working correctly end-to-end. Author : André Costa License : MIT Usage: # Run all tests python test_model.py # Run a specific test group only python test_model.py --only tokenizer python test_model.py --only corpus python test_model.py --only model python test_model.py --only generate python test_model.py --only export """ import os import sys import math import argparse import traceback import torch # ───────────────────────────────────────────────────────────── # Helpers # ───────────────────────────────────────────────────────────── PASS = " [PASS]" FAIL = " [FAIL]" SKIP = " [SKIP]" SEP = "─" * 55 results = [] # list of (test_name, passed: bool) def section(title: str) -> None: print(f"\n{SEP}") print(f" {title}") print(SEP) def check(name: str, condition: bool, detail: str = "") -> bool: status = PASS if condition else FAIL line = f"{status} {name}" if detail: line += f"\n {detail}" print(line) results.append((name, condition)) return condition def skip(name: str, reason: str) -> None: print(f"{SKIP} {name} ({reason})") results.append((name, None)) def summary() -> None: print(f"\n{SEP}") print(" Summary") print(SEP) passed = sum(1 for _, r in results if r is True) failed = sum(1 for _, r in results if r is False) skipped = sum(1 for _, r in results if r is None) total = passed + failed print(f" Passed : {passed}/{total}") print(f" Failed : {failed}/{total}") if skipped: print(f" Skipped : {skipped}") print(SEP) if failed > 0: print("\n Fix the failed tests before publishing.\n") sys.exit(1) else: print("\n All tests passed. Ready to export and publish.\n") # ───────────────────────────────────────────────────────────── # Test groups # ───────────────────────────────────────────────────────────── def test_tokenizer() -> None: section("1 — BPE Tokenizer") # 1.1 — tokenizer files exist tok_ok = check( "Tokenizer files exist (./tokenizer/)", os.path.isfile("./tokenizer/tokenizer.json") and os.path.isfile("./tokenizer/vocab.json"), "Run 'python bpe_tokenizer.py' first." ) if not tok_ok: skip("Tokenizer load", "tokenizer files missing") skip("Encode / decode", "tokenizer files missing") skip("Vocab size", "tokenizer files missing") skip("No UNK tokens", "tokenizer files missing") return # 1.2 — load without errors try: from bpe_tokenizer import BPETokenizer tokenizer = BPETokenizer.load("./tokenizer") check("Tokenizer loads without errors", True) except Exception as e: check("Tokenizer loads without errors", False, str(e)) skip("Encode / decode", "load failed") skip("Vocab size", "load failed") skip("No UNK tokens", "load failed") return # 1.3 — vocab size check( "Vocab size == 16384", tokenizer.vocab_size == 16384, f"Got vocab_size={tokenizer.vocab_size}" ) # 1.4 — encode / decode round-trip test_strings = [ "Hello, world!", "Once upon a time there was a little girl.", "Olá mundo! Aprendizado de máquina.", "The quick brown fox jumps over the lazy dog.", "Redes neurais aprendem padrões complexos.", ] all_ok = True for text in test_strings: ids = tokenizer.encode(text) decoded = tokenizer.decode(ids) if decoded != text: all_ok = False check(f"Encode/decode: {repr(text)}", False, f"Expected {repr(text)}, got {repr(decoded)}") check("Encode/decode round-trip (5 strings)", all_ok) # 1.5 — no UNK tokens (BPE on bytes should encode everything) exotic = "こんにちは 🚀 مرحبا" try: ids = tokenizer.encode(exotic) decoded = tokenizer.decode(ids) check("Encodes non-Latin text without errors", True) except Exception as e: check("Encodes non-Latin text without errors", False, str(e)) def test_corpus() -> None: section("2 — Corpus") # 2.1 — corpus directories exist for split in ["train", "val", "test"]: path = f"./corpus/{split}" exists = os.path.isdir(path) and len(os.listdir(path)) > 0 check( f"Corpus split exists: {split}", exists, "Run 'python data_pipeline.py' first." if not exists else "" ) if not os.path.isdir("./corpus/train"): skip("Corpus loads via CorpusDataset", "corpus missing") skip("Corpus chunk shape", "corpus missing") skip("Corpus token range", "corpus missing") return # 2.2 — loads via CorpusDataset try: from data_pipeline import CorpusDataset dataset = CorpusDataset("./corpus/train") check( "CorpusDataset loads without errors", len(dataset) > 0, f"Chunks: {len(dataset):,}" ) except Exception as e: check("CorpusDataset loads without errors", False, str(e)) skip("Corpus chunk shape", "load failed") skip("Corpus token range", "load failed") return # 2.3 — chunk shape sample = dataset[0] check( "Chunk shape == (512,)", sample.shape == (512,), f"Got shape {sample.shape}" ) # 2.4 — token IDs within vocab range from bpe_tokenizer import BPETokenizer tokenizer = BPETokenizer.load("./tokenizer") vocab_size = tokenizer.vocab_size bad_ids = [(sample < 0).sum().item(), (sample >= vocab_size).sum().item()] check( "All token IDs within vocab range", bad_ids[0] == 0 and bad_ids[1] == 0, f"{bad_ids[0]} negative, {bad_ids[1]} out-of-range IDs found" ) def test_model() -> None: section("3 — Model (forward pass)") try: from transformer import MiniLM, ModelConfig except Exception as e: check("transformer.py imports", False, str(e)) return check("transformer.py imports", True) # 3.1 — instantiate try: config = ModelConfig() model = MiniLM(config) check( "Model instantiates", True, f"{config.n_params / 1e6:.1f}M parameters" ) except Exception as e: check("Model instantiates", False, str(e)) skip("Forward pass", "instantiation failed") skip("Loss ~ log(vocab)", "instantiation failed") skip("Loss decreases", "instantiation failed") return device = torch.device("cuda" if torch.cuda.is_available() else "cpu") model = model.to(device) model.eval() # 3.2 — forward pass without errors try: B, T = 2, 64 ids = torch.randint(0, config.vocab_size, (B, T + 1)).to(device) inputs = ids[:, :-1].contiguous() targets = ids[:, 1:].contiguous() with torch.no_grad(): logits, loss = model(inputs, targets) check( "Forward pass runs without errors", logits.shape == (B, T, config.vocab_size), f"logits shape: {logits.shape}" ) except Exception as e: check("Forward pass runs without errors", False, str(e)) skip("Loss ~ log(vocab)", "forward pass failed") skip("Loss decreases", "forward pass failed") return # 3.3 — initial loss should be near log(vocab_size) — maximum entropy expected_loss = math.log(config.vocab_size) tolerance = expected_loss * 0.5 # within 50% actual_loss = loss.item() check( f"Initial loss near log(vocab_size) = {expected_loss:.2f}", abs(actual_loss - expected_loss) < tolerance, f"Got loss={actual_loss:.4f}, expected ~{expected_loss:.4f}" ) # 3.4 — model can compute gradients without errors # Note: we only verify that backward() runs cleanly. # Loss may not decrease in 5 steps with random data on an already # trained model — that is expected and not a sign of a problem. try: model.train() optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3) for _ in range(3): optimizer.zero_grad() ids = torch.randint(0, config.vocab_size, (2, 65)).to(device) _, loss = model(ids[:, :-1].contiguous(), ids[:, 1:].contiguous()) loss.backward() optimizer.step() check( "Backward pass runs without errors", True, f"Final loss: {loss.item():.4f}" ) except Exception as e: check("Backward pass runs without errors", False, str(e)) def test_generate() -> None: section("4 — Text Generation") # Requires a trained checkpoint ckpt_path = "./checkpoints/best_model.pt" if not os.path.isfile(ckpt_path): skip("Load checkpoint", "best_model.pt not found — train first") skip("Generate tokens", "checkpoint missing") skip("Output length correct", "checkpoint missing") return try: from transformer import MiniLM, ModelConfig from bpe_tokenizer import BPETokenizer ckpt = torch.load(ckpt_path, map_location="cpu", weights_only=True) cfg_dict = ckpt["model_config"] cfg_dict.pop("d_head", None) # derived in __post_init__, not a constructor arg config = ModelConfig(**cfg_dict) model = MiniLM(config) # strip _orig_mod. prefix added by torch.compile() state_dict = ckpt["model_state"] if any(k.startswith("_orig_mod.") for k in state_dict): state_dict = {k.replace("_orig_mod.", ""): v for k, v in state_dict.items()} model.load_state_dict(state_dict) model.eval() check("Checkpoint loads without errors", True) except Exception as e: check("Checkpoint loads without errors", False, str(e)) skip("Generate tokens", "load failed") skip("Output length correct", "load failed") return try: tokenizer = BPETokenizer.load("./tokenizer") prompts = ["Once upon a time", "The model learned"] n_new = 20 for prompt in prompts: input_ids = torch.tensor([tokenizer.encode(prompt)]) with torch.no_grad(): output = model.generate( input_ids, max_new_tokens=n_new, temperature=0.8, top_k=50, ) generated_text = tokenizer.decode(output[0].tolist()) n_generated = output.shape[1] - input_ids.shape[1] check( f"Generates {n_new} tokens from: {repr(prompt)}", n_generated == n_new, f"Output: {repr(generated_text)}" ) except Exception as e: check("Generate tokens", False, str(e)) def test_export() -> None: section("5 — HuggingFace Export") export_dir = "./hf_export" if not os.path.isdir(export_dir): skip("Export files exist", "hf_export/ not found — run --mode export first") skip("config.json valid", "hf_export/ not found") skip("Weights file exists","hf_export/ not found") skip("Model card exists", "hf_export/ not found") skip("Tokenizer files", "hf_export/ not found") return # 5.1 — required files required = [ "config.json", "README.md", "tokenizer.json", "vocab.json", ] for fname in required: path = os.path.join(export_dir, fname) check(f"Export file exists: {fname}", os.path.isfile(path)) # weights — either safetensors or .bin has_weights = ( os.path.isfile(os.path.join(export_dir, "model.safetensors")) or os.path.isfile(os.path.join(export_dir, "pytorch_model.bin")) ) check("Model weights file exists (safetensors or .bin)", has_weights) # 5.2 — config.json is valid JSON with required fields try: import json with open(os.path.join(export_dir, "config.json")) as f: cfg = json.load(f) required_keys = [ "vocab_size", "hidden_size", "num_hidden_layers", "num_attention_heads", "intermediate_size" ] missing = [k for k in required_keys if k not in cfg] check( "config.json contains required fields", len(missing) == 0, f"Missing: {missing}" if missing else "" ) except Exception as e: check("config.json is valid", False, str(e)) # ───────────────────────────────────────────────────────────── # Entry point # ───────────────────────────────────────────────────────────── if __name__ == "__main__": parser = argparse.ArgumentParser(description="MiniLM — pre-publication test suite") parser.add_argument( "--only", choices=["tokenizer", "corpus", "model", "generate", "export"], default=None, help="Run only a specific test group" ) args = parser.parse_args() print("=" * 55) print(" MiniLM — Pre-publication Test Suite") print("=" * 55) groups = { "tokenizer": test_tokenizer, "corpus": test_corpus, "model": test_model, "generate": test_generate, "export": test_export, } if args.only: groups[args.only]() else: for fn in groups.values(): try: fn() except Exception as e: print(f"\n [ERROR] Unexpected error in {fn.__name__}:") traceback.print_exc() summary()