File size: 14,771 Bytes

77de42d

"""
test_model.py
=============
Validation test suite for the MiniLM project.

Run this script before publishing to HuggingFace to confirm
that all components are working correctly end-to-end.

Author  : André Costa
License : MIT

Usage:
    # Run all tests
    python test_model.py

    # Run a specific test group only
    python test_model.py --only tokenizer
    python test_model.py --only corpus
    python test_model.py --only model
    python test_model.py --only generate
    python test_model.py --only export
"""

import os
import sys
import math
import argparse
import traceback

import torch

# ─────────────────────────────────────────────────────────────
# Helpers
# ─────────────────────────────────────────────────────────────

# Status tags that prefix every result line. The two leading spaces match the
# indentation used for section titles.
PASS, FAIL, SKIP = ("  [PASS]", "  [FAIL]", "  [SKIP]")

# Horizontal rule drawn around section and summary headings.
SEP = 55 * "─"

# Outcome log: one (test_name, outcome) tuple per recorded test, where outcome
# is True (passed), False (failed) or None (skipped).
results = list()


def section(title: str) -> None:
    """Print a heading for a test group.

    The heading is a blank line, a separator rule, the title indented by two
    spaces, and a closing separator rule.

    Args:
        title: Heading text shown between the two rules.
    """
    banner = "\n".join((f"\n{SEP}", f"  {title}", SEP))
    print(banner)


def check(name: str, condition: bool, detail: str = "") -> bool:
    """Report one test outcome and record it for the final summary.

    Prints a ``[PASS]`` or ``[FAIL]`` line for ``name`` (followed by an
    indented ``detail`` line when one is given) and appends
    ``(name, outcome)`` to the module-level ``results`` list.

    Args:
        name: Human-readable test name.
        condition: Outcome of the test. Any truthy/falsy value is accepted.
        detail: Optional extra line printed under the status line. It is
            printed whenever it is non-empty, regardless of the outcome.

    Returns:
        The outcome as a real ``bool``.
    """
    # summary() tallies outcomes by identity (``is True`` / ``is False``), so
    # the stored value must be an actual bool. Without the coercion a falsy
    # non-bool such as None would be recorded as a skip instead of a failure.
    passed = bool(condition)
    status = PASS if passed else FAIL
    line = f"{status}  {name}"
    if detail:
        line += f"\n         {detail}"
    print(line)
    results.append((name, passed))
    return passed


def skip(name: str, reason: str) -> None:
    """Report a test that was not run and record it as skipped.

    Prints a ``[SKIP]`` line with the reason in parentheses and appends
    ``(name, None)`` to the module-level ``results`` list.

    Args:
        name: Human-readable test name.
        reason: Short explanation of why the test could not run.
    """
    print(SKIP, name, f"({reason})", sep="  ")
    results.append((name, None))


def summary() -> None:
    """Print the pass/fail/skip totals and set the process exit status.

    Outcomes are read from the module-level ``results`` list and classified
    by identity: ``True`` is a pass, ``False`` a failure, ``None`` a skip.
    Skipped tests are excluded from the ``x/total`` denominators.

    Raises:
        SystemExit: With code 1 when at least one test failed.
    """
    passed = failed = skipped = 0
    for _, outcome in results:
        if outcome is True:
            passed += 1
        elif outcome is False:
            failed += 1
        elif outcome is None:
            skipped += 1
    total = passed + failed

    report = [
        f"\n{SEP}",
        "  Summary",
        SEP,
        f"  Passed  : {passed}/{total}",
        f"  Failed  : {failed}/{total}",
    ]
    if skipped:
        report.append(f"  Skipped : {skipped}")
    report.append(SEP)
    print("\n".join(report))

    if failed == 0:
        print("\n  All tests passed. Ready to export and publish.\n")
        return

    print("\n  Fix the failed tests before publishing.\n")
    sys.exit(1)


# ─────────────────────────────────────────────────────────────
# Test groups
# ─────────────────────────────────────────────────────────────

def test_tokenizer() -> None:
    """Validate the saved BPE tokenizer found under ``./tokenizer``.

    Runs, in order: file presence, loading via ``BPETokenizer.load``, the
    expected vocabulary size, an encode/decode round-trip on a few English
    and Portuguese sentences, and an encode/decode smoke test on non-Latin
    text. When the files are missing or loading fails, the remaining tests
    are recorded as skipped and the function returns early.
    """
    section("1 — BPE Tokenizer")

    # 1.1 — both serialized tokenizer files must be present
    files_present = all(
        os.path.isfile(f"./tokenizer/{fname}")
        for fname in ("tokenizer.json", "vocab.json")
    )
    if not check(
        "Tokenizer files exist (./tokenizer/)",
        files_present,
        "Run 'python bpe_tokenizer.py' first.",
    ):
        for label in ("Tokenizer load", "Encode / decode", "Vocab size", "No UNK tokens"):
            skip(label, "tokenizer files missing")
        return

    # 1.2 — import and load; any exception counts as a load failure
    try:
        from bpe_tokenizer import BPETokenizer
        tokenizer = BPETokenizer.load("./tokenizer")
        check("Tokenizer loads without errors", True)
    except Exception as e:
        check("Tokenizer loads without errors", False, str(e))
        for label in ("Encode / decode", "Vocab size", "No UNK tokens"):
            skip(label, "load failed")
        return

    # 1.3 — vocabulary size must match the project's fixed value
    actual_vocab = tokenizer.vocab_size
    check(
        "Vocab size == 16384",
        actual_vocab == 16384,
        f"Got vocab_size={tokenizer.vocab_size}",
    )

    # 1.4 — decode(encode(text)) must reproduce the input exactly.
    # Each mismatch is reported individually, then one aggregate result follows.
    samples = (
        "Hello, world!",
        "Once upon a time there was a little girl.",
        "Olá mundo! Aprendizado de máquina.",
        "The quick brown fox jumps over the lazy dog.",
        "Redes neurais aprendem padrões complexos.",
    )
    mismatches = 0
    for original in samples:
        restored = tokenizer.decode(tokenizer.encode(original))
        if restored != original:
            mismatches += 1
            check(
                f"Encode/decode: {original!r}",
                False,
                f"Expected {original!r}, got {restored!r}",
            )
    check("Encode/decode round-trip (5 strings)", mismatches == 0)

    # 1.5 — non-Latin input must pass through encode and decode without raising.
    # Only the absence of an exception is verified, not the decoded text.
    exotic = "こんにちは 🚀 مرحبا"
    try:
        tokenizer.decode(tokenizer.encode(exotic))
        check("Encodes non-Latin text without errors", True)
    except Exception as e:
        check("Encodes non-Latin text without errors", False, str(e))


def test_corpus() -> None:
    """Validate the prepared corpus found under ``./corpus``.

    Runs, in order: presence of non-empty ``train``/``val``/``test``
    directories, loading the training split through ``CorpusDataset``, the
    shape of the first chunk, and whether every token ID in that chunk lies
    inside the tokenizer's vocabulary.

    Tests that cannot run (missing corpus, failed load, empty dataset) are
    recorded as skipped. A tokenizer that cannot be loaded is recorded as a
    failure of the range test rather than raised.
    """
    section("2 — Corpus")

    # 2.1 — each split directory must exist and contain at least one entry
    for split in ["train", "val", "test"]:
        path = f"./corpus/{split}"
        exists = os.path.isdir(path) and len(os.listdir(path)) > 0
        check(
            f"Corpus split exists: {split}",
            exists,
            "Run 'python data_pipeline.py' first." if not exists else "",
        )

    if not os.path.isdir("./corpus/train"):
        skip("Corpus loads via CorpusDataset", "corpus missing")
        skip("Corpus chunk shape",             "corpus missing")
        skip("Corpus token range",             "corpus missing")
        return

    # 2.2 — loads via CorpusDataset
    try:
        from data_pipeline import CorpusDataset
        dataset = CorpusDataset("./corpus/train")
        n_chunks = len(dataset)
    except Exception as e:
        check("CorpusDataset loads without errors", False, str(e))
        skip("Corpus chunk shape", "load failed")
        skip("Corpus token range", "load failed")
        return

    loaded = check(
        "CorpusDataset loads without errors",
        n_chunks > 0,
        f"Chunks: {n_chunks:,}",
    )
    if not loaded:
        # An empty dataset has no first chunk; indexing it below would raise
        # instead of reporting, so the dependent tests are skipped.
        skip("Corpus chunk shape", "dataset is empty")
        skip("Corpus token range", "dataset is empty")
        return

    # 2.3 — chunk shape
    sample = dataset[0]
    check(
        "Chunk shape == (512,)",
        sample.shape == (512,),
        f"Got shape {sample.shape}",
    )

    # 2.4 — token IDs within vocab range.
    # The tokenizer is a separate artifact; if it cannot be loaded the range
    # test is reported as failed so the problem still reaches the summary.
    try:
        from bpe_tokenizer import BPETokenizer
        vocab_size = BPETokenizer.load("./tokenizer").vocab_size
    except Exception as e:
        check(
            "All token IDs within vocab range",
            False,
            f"Could not load tokenizer: {e}",
        )
        return

    # Assumes `sample` supports elementwise comparison followed by
    # .sum().item() (e.g. a torch tensor) — confirm against CorpusDataset.
    n_negative = (sample < 0).sum().item()
    n_too_large = (sample >= vocab_size).sum().item()
    check(
        "All token IDs within vocab range",
        n_negative == 0 and n_too_large == 0,
        f"{n_negative} negative, {n_too_large} out-of-range IDs found",
    )


def test_model() -> None:
    """Smoke-test a newly constructed ``MiniLM`` built from ``ModelConfig()``.

    Runs, in order: importing ``transformer``, instantiating the model, a
    forward pass on random token IDs, a sanity check that the loss is within
    50% of ``log(vocab_size)``, and three optimizer steps to confirm that the
    backward pass runs. The model is called as ``model(inputs, targets)`` and
    is expected to return ``(logits, loss)``.

    No checkpoint is loaded here. When import, instantiation or the forward
    pass fails, the function returns early and records the later tests as
    skipped where the code does so.
    """
    section("3 — Model (forward pass)")

    try:
        from transformer import MiniLM, ModelConfig
    except Exception as e:
        check("transformer.py imports", False, str(e))
        return

    check("transformer.py imports", True)

    # 3.1 — build the model from the default configuration
    try:
        config = ModelConfig()
        model = MiniLM(config)
        check(
            "Model instantiates",
            True,
            f"{config.n_params / 1e6:.1f}M parameters",
        )
    except Exception as e:
        check("Model instantiates", False, str(e))
        for label in ("Forward pass", "Loss ~ log(vocab)", "Loss decreases"):
            skip(label, "instantiation failed")
        return

    # Use the GPU when one is visible, otherwise stay on the CPU.
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    model = model.to(device)
    model.eval()

    # 3.2 — forward pass on random IDs; targets are the inputs shifted by one
    try:
        batch, seq_len = 2, 64
        tokens = torch.randint(
            0, config.vocab_size, (batch, seq_len + 1)
        ).to(device)
        inputs = tokens[:, :-1].contiguous()
        targets = tokens[:, 1:].contiguous()
        with torch.no_grad():
            logits, loss = model(inputs, targets)
        check(
            "Forward pass runs without errors",
            logits.shape == (batch, seq_len, config.vocab_size),
            f"logits shape: {logits.shape}",
        )
    except Exception as e:
        check("Forward pass runs without errors", False, str(e))
        for label in ("Loss ~ log(vocab)", "Loss decreases"):
            skip(label, "forward pass failed")
        return

    # 3.3 — a uniform prediction over the vocabulary gives loss = log(vocab_size).
    # The measured loss must fall within 50% of that value.
    expected_loss = math.log(config.vocab_size)
    actual_loss = loss.item()
    allowed_gap = expected_loss * 0.5
    check(
        f"Initial loss near log(vocab_size) = {expected_loss:.2f}",
        abs(actual_loss - expected_loss) < allowed_gap,
        f"Got loss={actual_loss:.4f}, expected ~{expected_loss:.4f}",
    )

    # 3.4 — gradients and optimizer updates must run without raising.
    # Only clean execution of backward() and step() is verified. The loss is
    # not required to decrease, because every batch is fresh random tokens.
    try:
        model.train()
        optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
        for _ in range(3):
            optimizer.zero_grad()
            tokens = torch.randint(0, config.vocab_size, (2, 65)).to(device)
            step_inputs = tokens[:, :-1].contiguous()
            step_targets = tokens[:, 1:].contiguous()
            _, loss = model(step_inputs, step_targets)
            loss.backward()
            optimizer.step()
        check(
            "Backward pass runs without errors",
            True,
            f"Final loss: {loss.item():.4f}",
        )
    except Exception as e:
        check("Backward pass runs without errors", False, str(e))


def test_generate() -> None:
    """Check that the trained checkpoint loads and generates text.

    Requires ``./checkpoints/best_model.pt``; when it is absent every test in
    this group is recorded as skipped. Otherwise the checkpoint's
    ``model_config`` and ``model_state`` entries are used to rebuild the
    model on the CPU, and for each prompt the number of newly generated
    tokens is compared with the requested ``max_new_tokens``.
    """
    section("4 — Text Generation")

    # Generation is only meaningful with trained weights.
    ckpt_path = "./checkpoints/best_model.pt"
    if not os.path.isfile(ckpt_path):
        skip("Load checkpoint",       "best_model.pt not found — train first")
        for label in ("Generate tokens", "Output length correct"):
            skip(label, "checkpoint missing")
        return

    try:
        from transformer import MiniLM, ModelConfig
        from bpe_tokenizer import BPETokenizer

        checkpoint = torch.load(ckpt_path, map_location="cpu", weights_only=True)

        # "d_head" is dropped before building the config. Per the original
        # author's note it is derived inside ModelConfig rather than accepted
        # by its constructor.
        config_kwargs = checkpoint["model_config"]
        config_kwargs.pop("d_head", None)
        model = MiniLM(ModelConfig(**config_kwargs))

        # Weights saved from a torch.compile()-wrapped model carry an
        # "_orig_mod." marker in their keys; remove it so the names match.
        compiled_marker = "_orig_mod."
        weights = checkpoint["model_state"]
        if any(key.startswith(compiled_marker) for key in weights):
            weights = {
                key.replace(compiled_marker, ""): tensor
                for key, tensor in weights.items()
            }
        model.load_state_dict(weights)
        model.eval()
        check("Checkpoint loads without errors", True)
    except Exception as e:
        check("Checkpoint loads without errors", False, str(e))
        for label in ("Generate tokens", "Output length correct"):
            skip(label, "load failed")
        return

    try:
        tokenizer = BPETokenizer.load("./tokenizer")
        n_new = 20

        for prompt in ("Once upon a time", "The model learned"):
            # Batch of one: the encoded prompt wrapped in an outer list.
            prompt_ids = torch.tensor([tokenizer.encode(prompt)])
            with torch.no_grad():
                output = model.generate(
                    prompt_ids,
                    max_new_tokens=n_new,
                    temperature=0.8,
                    top_k=50,
                )
            generated_text = tokenizer.decode(output[0].tolist())
            # New tokens = output length minus prompt length.
            n_generated = output.shape[1] - prompt_ids.shape[1]

            check(
                f"Generates {n_new} tokens from: {prompt!r}",
                n_generated == n_new,
                f"Output: {generated_text!r}",
            )
    except Exception as e:
        check("Generate tokens", False, str(e))


def test_export() -> None:
    """Validate the contents of the ``./hf_export`` directory.

    When the directory is missing every test in this group is recorded as
    skipped. Otherwise the function checks that the required files exist,
    that a weights file is present (``model.safetensors`` or
    ``pytorch_model.bin``), and that ``config.json`` parses as a JSON object
    containing the required keys.
    """
    section("5 — HuggingFace Export")

    export_dir = "./hf_export"

    if not os.path.isdir(export_dir):
        skip("Export files exist", "hf_export/ not found — run --mode export first")
        skip("config.json valid",  "hf_export/ not found")
        skip("Weights file exists","hf_export/ not found")
        skip("Model card exists",  "hf_export/ not found")
        skip("Tokenizer files",    "hf_export/ not found")
        return

    # 5.1 — required files
    required = (
        "config.json",
        "README.md",
        "tokenizer.json",
        "vocab.json",
    )
    for fname in required:
        path = os.path.join(export_dir, fname)
        check(f"Export file exists: {fname}", os.path.isfile(path))

    # weights — either safetensors or .bin
    has_weights = any(
        os.path.isfile(os.path.join(export_dir, weights_name))
        for weights_name in ("model.safetensors", "pytorch_model.bin")
    )
    check("Model weights file exists (safetensors or .bin)", has_weights)

    # 5.2 — config.json is valid JSON with required fields
    try:
        import json

        # Read explicitly as UTF-8 so the result does not depend on the
        # platform's default text encoding.
        config_path = os.path.join(export_dir, "config.json")
        with open(config_path, encoding="utf-8") as f:
            cfg = json.load(f)

        # A JSON string or array would make the `in` tests below perform
        # substring or element checks and could pass spuriously.
        if not isinstance(cfg, dict):
            raise TypeError(
                f"expected a JSON object, got {type(cfg).__name__}"
            )

        required_keys = [
            "vocab_size", "hidden_size", "num_hidden_layers",
            "num_attention_heads", "intermediate_size",
        ]
        missing = [k for k in required_keys if k not in cfg]
        check(
            "config.json contains required fields",
            not missing,
            f"Missing: {missing}" if missing else "",
        )
    except Exception as e:
        # Covers a missing file, a decode error, malformed JSON and the
        # non-object case raised above.
        check("config.json is valid", False, str(e))


# ─────────────────────────────────────────────────────────────
# Entry point
# ─────────────────────────────────────────────────────────────

if __name__ == "__main__":
    # Command-line entry point: run every test group, or only the one named
    # by --only, then print the summary (which exits with status 1 when any
    # test failed).
    parser = argparse.ArgumentParser(description="MiniLM — pre-publication test suite")
    parser.add_argument(
        "--only",
        choices=["tokenizer", "corpus", "model", "generate", "export"],
        default=None,
        help="Run only a specific test group"
    )
    args = parser.parse_args()

    print("=" * 55)
    print("  MiniLM — Pre-publication Test Suite")
    print("=" * 55)

    groups = {
        "tokenizer": test_tokenizer,
        "corpus":    test_corpus,
        "model":     test_model,
        "generate":  test_generate,
        "export":    test_export,
    }

    selected = [groups[args.only]] if args.only else list(groups.values())

    for fn in selected:
        try:
            fn()
        except Exception:
            print(f"\n  [ERROR] Unexpected error in {fn.__name__}:")
            traceback.print_exc()
            # Record the crash as a failure. Otherwise a group that aborted
            # part-way would leave no trace in `results`, and the summary
            # could still report that everything passed.
            results.append((f"{fn.__name__} (unexpected error)", False))

    summary()