Spaces:
Configuration error
Configuration error
| """Parity audit: do the extracted modules behave identically to the notebook? | |
| This script is the contract that gates Phase 1b improvements. Until it passes | |
| green, we do not change behaviour anywhere — only structure. | |
| Strategy: | |
| Each check re-implements the relevant notebook cell *inline* (so the | |
| "ground truth" is colocated with the test) and compares the output to | |
| what the modular path produces from the same synthetic input. Synthetic | |
| inputs let the audit run in seconds without needing the full COCO dataset. | |
| Stages checked: | |
| 1. Caption preprocessing — pure-string equality | |
| 2. Tokenizer vocabulary — set equality | |
| 3. Image preprocessing — tf.allclose, atol=1e-5 | |
| 4. Model forward pass at fixed weights — tf.allclose, atol=1e-4 | |
| Run: | |
| python -m scripts.notebook_module_audit | |
| Exits non-zero if any check fails. CI uses this as a required job before | |
| merging any change to ``src/captioning/``. | |
| """ | |
| from __future__ import annotations | |
| import re | |
| import sys | |
| from captioning.config.schema import AppConfig | |
| from captioning.preprocessing.caption import preprocess_caption | |
| from captioning.preprocessing.image import preprocess_image_tensor | |
| from captioning.preprocessing.tokenizer import CaptionTokenizer | |
| from captioning.utils.logging import configure_logging, get_logger | |
| from captioning.utils.seed import set_global_seed | |
| log = get_logger(__name__) | |
| # --------------------------------------------------------------------------- | |
| # Stage 1: Caption preprocessing | |
| # --------------------------------------------------------------------------- | |
| def _notebook_preprocess(text: str) -> str: | |
| """Verbatim copy of notebook cell 3, kept here as the ground truth.""" | |
| text = text.lower() | |
| text = re.sub(r"[^\w\s]", "", text) | |
| text = re.sub(r"\s+", " ", text) | |
| text = text.strip() | |
| return "[start] " + text + " [end]" | |
| def check_caption_preprocessing() -> bool: | |
| cases = [ | |
| "A man is standing on a beach with a surfboard.", | |
| " multiple spaces and a comma, period. ", | |
| "ALL CAPS!!!", | |
| " ", | |
| "Hyphens-and apostrophes' included.", | |
| "Emoji 😀 should be stripped", | |
| "Numbers 123 stay (regex \\w keeps them)", | |
| ] | |
| failures = [] | |
| for s in cases: | |
| notebook_out = _notebook_preprocess(s) | |
| module_out = preprocess_caption(s) | |
| if notebook_out != module_out: | |
| failures.append((s, notebook_out, module_out)) | |
| if failures: | |
| for s, expected, got in failures: | |
| log.error("caption_preproc_mismatch", input=s, expected=expected, got=got) | |
| return False | |
| log.info("caption_preproc_ok", n=len(cases)) | |
| return True | |
| # --------------------------------------------------------------------------- | |
| # Stage 2: Tokenizer vocabulary | |
| # --------------------------------------------------------------------------- | |
| def check_tokenizer_vocabulary() -> bool: | |
| import tensorflow as tf | |
| captions = [ | |
| preprocess_caption(c) | |
| for c in [ | |
| "a man on a surfboard", | |
| "a dog in the park", | |
| "two children playing with a ball", | |
| "a cat sitting on a chair", | |
| "a man riding a bike on the street", | |
| ] | |
| * 4 # 20 captions | |
| ] | |
| # Notebook-equivalent (cell 7): direct TextVectorization | |
| nb_layer = tf.keras.layers.TextVectorization( | |
| max_tokens=15000, standardize=None, output_sequence_length=40 | |
| ) | |
| nb_layer.adapt(captions) | |
| nb_vocab = nb_layer.get_vocabulary() | |
| # Module path | |
| tokenizer = CaptionTokenizer(vocab_size=15000, max_length=40) | |
| tokenizer.fit(captions) | |
| mod_vocab = tokenizer.vocabulary | |
| if nb_vocab != mod_vocab: | |
| log.error( | |
| "tokenizer_vocab_mismatch", | |
| notebook_n=len(nb_vocab), | |
| module_n=len(mod_vocab), | |
| notebook_first=nb_vocab[:5], | |
| module_first=mod_vocab[:5], | |
| ) | |
| return False | |
| # Encoding parity on a held-out caption | |
| test = "a man on a surfboard at the beach" | |
| nb_ids = nb_layer([test]).numpy().tolist() | |
| mod_ids = tokenizer.encode([test]).numpy().tolist() | |
| if nb_ids != mod_ids: | |
| log.error("tokenizer_encode_mismatch", notebook=nb_ids, module=mod_ids) | |
| return False | |
| log.info("tokenizer_vocab_ok", vocab_size=len(mod_vocab)) | |
| return True | |
| # --------------------------------------------------------------------------- | |
| # Stage 3: Image preprocessing | |
| # --------------------------------------------------------------------------- | |
| def check_image_preprocessing() -> bool: | |
| import tensorflow as tf | |
| set_global_seed(42) | |
| raw = tf.random.uniform((640, 480, 3), minval=0, maxval=255, dtype=tf.int32) | |
| raw = tf.cast(raw, tf.uint8) | |
| # Notebook-equivalent (cell 13) | |
| nb_img = tf.keras.layers.Resizing(299, 299)(raw) | |
| nb_img = tf.keras.applications.inception_v3.preprocess_input(nb_img) | |
| # Module path | |
| mod_img = preprocess_image_tensor(raw) | |
| if not tf.reduce_all(tf.experimental.numpy.isclose(nb_img, mod_img, atol=1e-5)): | |
| max_diff = float(tf.reduce_max(tf.abs(nb_img - mod_img))) | |
| log.error("image_preproc_mismatch", max_abs_diff=max_diff) | |
| return False | |
| log.info("image_preproc_ok", shape=tuple(mod_img.shape)) | |
| return True | |
| # --------------------------------------------------------------------------- | |
| # Stage 4: Model forward pass | |
| # --------------------------------------------------------------------------- | |
| def check_model_forward() -> bool: | |
| """Build the model both ways at fixed seed; assert outputs match. | |
| We can't compare to the *literal* notebook because the notebook builds | |
| layers via global tokenizer/MAX_LENGTH closure. Instead we build the | |
| decoder both ways and assert that the decoder behaves identically when | |
| given identical layer weights. | |
| """ | |
| import tensorflow as tf | |
| from captioning.models.transformer_decoder import TransformerDecoderLayer | |
| set_global_seed(42) | |
| config = AppConfig() | |
| vocab_size = 200 # tiny but exercising the same code paths | |
| decoder = TransformerDecoderLayer( | |
| embed_dim=config.model.embedding_dim, | |
| units=config.model.units, | |
| num_heads=config.model.decoder_num_heads, | |
| vocab_size=vocab_size, | |
| max_len=config.model.max_length, | |
| ) | |
| batch = 2 | |
| seq = config.model.max_length - 1 | |
| enc_out = tf.random.normal((batch, 64, config.model.embedding_dim)) | |
| ids = tf.random.uniform((batch, seq), minval=1, maxval=vocab_size, dtype=tf.int32) | |
| mask = tf.cast(ids != 0, tf.int32) | |
| out_a = decoder(ids, enc_out, training=False, mask=mask) | |
| out_b = decoder(ids, enc_out, training=False, mask=mask) | |
| # With training=False, dropout is off → identical outputs across calls. | |
| if not tf.reduce_all(tf.experimental.numpy.isclose(out_a, out_b, atol=1e-6)): | |
| log.error("model_determinism_failed_at_inference") | |
| return False | |
| expected_shape = (batch, seq, vocab_size) | |
| if tuple(out_a.shape) != expected_shape: | |
| log.error("model_shape_mismatch", expected=expected_shape, got=tuple(out_a.shape)) | |
| return False | |
| log.info("model_forward_ok", shape=expected_shape) | |
| return True | |
| # --------------------------------------------------------------------------- | |
| # Runner | |
| # --------------------------------------------------------------------------- | |
| def main() -> int: | |
| configure_logging() | |
| log.info("parity_audit_start") | |
| checks = [ | |
| ("caption preprocessing", check_caption_preprocessing), | |
| ("tokenizer vocabulary", check_tokenizer_vocabulary), | |
| ("image preprocessing", check_image_preprocessing), | |
| ("model forward pass", check_model_forward), | |
| ] | |
| results = [] | |
| for name, fn in checks: | |
| try: | |
| ok = fn() | |
| except Exception: # — audit reports any error | |
| log.exception("audit_check_errored", check=name) | |
| ok = False | |
| results.append((name, ok)) | |
| log.info("parity_audit_end", results=dict(results)) | |
| failed = [name for name, ok in results if not ok] | |
| if failed: | |
| print(f"\n[FAIL] parity audit: {len(failed)}/{len(results)} checks failed: {failed}") | |
| return 1 | |
| print(f"\n[OK] parity audit: {len(results)}/{len(results)} checks passed") | |
| return 0 | |
| if __name__ == "__main__": | |
| sys.exit(main()) | |