| """ |
| Tests for the Encoder. |
| |
| Verifies against HLD spec: |
| - Word-level: direct lookup in embedding table |
| - Sentence-level: weighted average of word vectors |
| - OOV: zero vector (honest — "I don't have this word") |
| - Vectors are normalized (for cosine similarity) |
| - No training — just lookup |
| |
| Uses synthetic vocabulary to avoid downloading GloVe in tests. |
| """ |
|
|
| import sys |
| from pathlib import Path |
|
|
| import numpy as np |
|
|
| sys.path.insert(0, str(Path(__file__).parent.parent / "src")) |
| from encoder import Encoder |
|
|
# Vector dimensionality for the synthetic vocab; 300 matches the GloVe
# embeddings the real encoder loads (per the module docstring) — TODO confirm.
DIM = 300
|
|
|
|
def make_vocab():
    """Build a small, deterministic synthetic vocabulary for tests.

    Note: the words are generated in a fixed order from RandomState(42),
    so every call yields exactly the same vectors.
    """
    rng = np.random.RandomState(42)
    word_list = [
        "shakespeare",
        "hamlet",
        "wrote",
        "playwright",
        "english",
        "cat",
        "dog",
        "the",
        "who",
        "is",
    ]
    return {word: rng.randn(DIM).astype(np.float32) for word in word_list}
|
|
|
|
def make_encoder():
    """Return an Encoder preloaded with the synthetic test vocabulary."""
    encoder = Encoder(data_dir="/tmp/test_encoder", dim=DIM)
    vocab = make_vocab()
    encoder.load_from_dict(vocab)
    return encoder
|
|
|
|
class TestEncoderWord:
    """Word-level encoding: direct table lookup, unit norm, OOV handling."""

    def test_known_word_returns_vector(self):
        encoder = make_encoder()
        vec = encoder.encode_word("shakespeare")
        assert vec.shape == (DIM,)
        assert np.linalg.norm(vec) > 0

    def test_known_word_is_normalized(self):
        encoder = make_encoder()
        assert abs(np.linalg.norm(encoder.encode_word("hamlet")) - 1.0) < 1e-5

    def test_oov_returns_zero(self):
        """An unknown word encodes to the zero vector — honest about not knowing."""
        encoder = make_encoder()
        vec = encoder.encode_word("glorpnax")
        assert not np.any(vec)

    def test_case_insensitive(self):
        encoder = make_encoder()
        np.testing.assert_array_equal(
            encoder.encode_word("Shakespeare"),
            encoder.encode_word("shakespeare"),
        )

    def test_strips_whitespace(self):
        encoder = make_encoder()
        np.testing.assert_array_equal(
            encoder.encode_word(" hamlet "),
            encoder.encode_word("hamlet"),
        )

    def test_has_word(self):
        encoder = make_encoder()
        assert encoder.has_word("cat") is True
        assert encoder.has_word("glorpnax") is False

    def test_returns_copy_not_reference(self):
        """Mutating a returned vector must not corrupt the stored vocabulary."""
        encoder = make_encoder()
        mutated = encoder.encode_word("cat")
        mutated[0] = 999.0
        fresh = encoder.encode_word("cat")
        assert fresh[0] != 999.0
|
|
|
|
class TestEncoderSentence:
    """Sentence-level encoding: weighted averaging, OOV skipping, tokenization."""

    def test_single_word_sentence(self):
        encoder = make_encoder()
        from_word = encoder.encode_word("hamlet")
        from_sentence = encoder.encode_sentence("hamlet")
        np.testing.assert_array_almost_equal(from_word, from_sentence, decimal=5)

    def test_sentence_is_normalized(self):
        encoder = make_encoder()
        length = np.linalg.norm(encoder.encode_sentence("who wrote hamlet"))
        # A zero vector (all-OOV result) is exempt from the unit-norm check.
        if length > 0:
            assert abs(length - 1.0) < 1e-5

    def test_all_oov_returns_zero(self):
        """A sentence of only unknown words → zero vector. Honest abstention."""
        encoder = make_encoder()
        vec = encoder.encode_sentence("glorpnax zibble fweep")
        assert not np.any(vec)

    def test_empty_string_returns_zero(self):
        encoder = make_encoder()
        assert not np.any(encoder.encode_sentence(""))

    def test_mixed_known_oov(self):
        """Unknown tokens are dropped; the known tokens still drive the result."""
        encoder = make_encoder()
        clean = encoder.encode_sentence("hamlet")
        noisy = encoder.encode_sentence("glorpnax hamlet zibble")
        assert float(np.dot(clean, noisy)) > 0.99

    def test_different_sentences_different_vectors(self):
        encoder = make_encoder()
        first = encoder.encode_sentence("who wrote hamlet")
        second = encoder.encode_sentence("the cat is english")
        assert float(np.dot(first, second)) < 0.99

    def test_word_order_matters(self):
        """Position weighting makes reversed sentences close but not identical."""
        encoder = make_encoder()
        forward = encoder.encode_sentence("cat dog")
        backward = encoder.encode_sentence("dog cat")
        similarity = float(np.dot(forward, backward))
        assert 0.9 < similarity < 1.0

    def test_punctuation_stripped(self):
        encoder = make_encoder()
        with_punct = encoder.encode_sentence("who wrote hamlet?")
        without_punct = encoder.encode_sentence("who wrote hamlet")
        np.testing.assert_array_almost_equal(with_punct, without_punct, decimal=5)

    def test_tokenization(self):
        encoder = make_encoder()
        assert encoder._tokenize("Who wrote Hamlet?") == ["who", "wrote", "hamlet"]

    def test_tokenization_special_chars(self):
        encoder = make_encoder()
        expected = ["cat", "s", "dog", "eared", "the"]
        assert encoder._tokenize("cat's dog-eared, the.") == expected
|
|
|
|
class TestEncoderNearestWords:
    """Nearest-neighbour lookup over the vocabulary."""

    def test_nearest_to_itself(self):
        """The closest vocabulary entry to a word's own vector is that word."""
        encoder = make_encoder()
        query = encoder.encode_word("hamlet")
        result = encoder.nearest_words(query, k=1)
        assert len(result) == 1
        word, score = result[0]
        assert word == "hamlet"
        assert abs(score - 1.0) < 1e-5

    def test_nearest_returns_k(self):
        encoder = make_encoder()
        result = encoder.nearest_words(encoder.encode_word("cat"), k=3)
        assert len(result) == 3

    def test_nearest_sorted_by_similarity(self):
        encoder = make_encoder()
        result = encoder.nearest_words(encoder.encode_word("cat"), k=5)
        scores = [score for _, score in result]
        assert scores == sorted(scores, reverse=True)

    def test_nearest_zero_vector(self):
        """The zero (OOV) vector has no meaningful neighbours."""
        encoder = make_encoder()
        assert encoder.nearest_words(np.zeros(DIM, dtype=np.float32)) == []
|
|
|
|
class TestEncoderVocab:
    """Vocabulary loading and bookkeeping."""

    def test_vocab_size(self):
        enc = make_encoder()
        # Derive the expected size from the fixture instead of hard-coding 10,
        # so this test stays correct if words are added to the synthetic vocab.
        assert enc.vocab_size == len(make_vocab())

    def test_load_from_dict(self):
        """load_from_dict grows the vocab from an in-memory mapping."""
        enc = Encoder(data_dir="/tmp/test", dim=DIM)
        assert enc.vocab_size == 0
        enc.load_from_dict({"hello": np.ones(DIM)})
        assert enc.vocab_size == 1

    def test_vectors_normalized_on_load(self):
        """All loaded vectors should be unit-length, even if input isn't."""
        enc = Encoder(data_dir="/tmp/test", dim=DIM)
        enc.load_from_dict({"word": np.ones(DIM) * 5.0})
        v = enc.encode_word("word")
        assert abs(np.linalg.norm(v) - 1.0) < 1e-5
|
|
|
|
if __name__ == "__main__":
    # Running this file directly delegates to pytest. Propagate pytest's
    # return code via SystemExit so shells/CI see test failures as a
    # nonzero exit status (the original discarded it and always exited 0).
    import pytest
    raise SystemExit(pytest.main([__file__, "-v"]))
|
|