# guru/tests/test_encoder.py
"""
Tests for the Encoder.
Verifies against HLD spec:
- Word-level: direct lookup in embedding table
- Sentence-level: weighted average of word vectors
- OOV: zero vector (honest — "I don't have this word")
- Vectors are normalized (for cosine similarity)
- No training — just lookup
Uses synthetic vocabulary to avoid downloading GloVe in tests.
"""
import sys
from pathlib import Path
import numpy as np
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
from encoder import Encoder
DIM = 300


def make_vocab():
    """Build a small deterministic synthetic vocabulary for tests.

    Returns a dict mapping word -> float32 vector of length ``DIM``. The
    vectors are drawn from a fixed-seed RNG so every test run sees the
    exact same embeddings (and no GloVe download is needed).
    """
    rng = np.random.RandomState(42)
    # Draw order matters for reproducibility: one randn() call per word,
    # in this exact sequence.
    vocab_words = [
        "shakespeare", "hamlet", "wrote", "playwright", "english",
        "cat", "dog", "the", "who", "is",
    ]
    return {word: rng.randn(DIM).astype(np.float32) for word in vocab_words}
def make_encoder():
    """Return an Encoder preloaded with the synthetic test vocabulary."""
    encoder = Encoder(data_dir="/tmp/test_encoder", dim=DIM)
    encoder.load_from_dict(make_vocab())
    return encoder
class TestEncoderWord:
    """Word-level encoding: direct lookup, normalization, OOV handling."""

    def test_known_word_returns_vector(self):
        encoder = make_encoder()
        vec = encoder.encode_word("shakespeare")
        assert vec.shape == (DIM,)
        assert np.linalg.norm(vec) > 0

    def test_known_word_is_normalized(self):
        encoder = make_encoder()
        vec = encoder.encode_word("hamlet")
        assert abs(np.linalg.norm(vec) - 1.0) < 1e-5

    def test_oov_returns_zero(self):
        """OOV = zero vector. Honest about not knowing."""
        encoder = make_encoder()
        vec = encoder.encode_word("glorpnax")
        assert not np.any(vec)

    def test_case_insensitive(self):
        encoder = make_encoder()
        upper = encoder.encode_word("Shakespeare")
        lower = encoder.encode_word("shakespeare")
        np.testing.assert_array_equal(upper, lower)

    def test_strips_whitespace(self):
        encoder = make_encoder()
        padded = encoder.encode_word(" hamlet ")
        bare = encoder.encode_word("hamlet")
        np.testing.assert_array_equal(padded, bare)

    def test_has_word(self):
        encoder = make_encoder()
        assert encoder.has_word("cat") is True
        assert encoder.has_word("glorpnax") is False

    def test_returns_copy_not_reference(self):
        """Mutating a returned vector must not corrupt the vocabulary."""
        encoder = make_encoder()
        first = encoder.encode_word("cat")
        first[0] = 999.0
        second = encoder.encode_word("cat")
        assert second[0] != 999.0
class TestEncoderSentence:
    """Sentence-level encoding: averaging, OOV skipping, tokenization."""

    def test_single_word_sentence(self):
        encoder = make_encoder()
        word_vec = encoder.encode_word("hamlet")
        sent_vec = encoder.encode_sentence("hamlet")
        # A one-word sentence collapses to that word's (normalized) vector.
        np.testing.assert_array_almost_equal(word_vec, sent_vec, decimal=5)

    def test_sentence_is_normalized(self):
        encoder = make_encoder()
        vec = encoder.encode_sentence("who wrote hamlet")
        magnitude = np.linalg.norm(vec)
        if magnitude > 0:
            assert abs(magnitude - 1.0) < 1e-5

    def test_all_oov_returns_zero(self):
        """Sentence with no known words → zero vector. Honest abstention."""
        encoder = make_encoder()
        vec = encoder.encode_sentence("glorpnax zibble fweep")
        assert not np.any(vec)

    def test_empty_string_returns_zero(self):
        encoder = make_encoder()
        assert not np.any(encoder.encode_sentence(""))

    def test_mixed_known_oov(self):
        """OOV words are skipped, known words contribute."""
        encoder = make_encoder()
        known_only = encoder.encode_sentence("hamlet")
        mixed = encoder.encode_sentence("glorpnax hamlet zibble")
        # "hamlet" is the sole known word, so the vectors nearly coincide.
        assert float(np.dot(known_only, mixed)) > 0.99

    def test_different_sentences_different_vectors(self):
        encoder = make_encoder()
        first = encoder.encode_sentence("who wrote hamlet")
        second = encoder.encode_sentence("the cat is english")
        # Distinct word sets must not map to (nearly) the same direction.
        assert float(np.dot(first, second)) < 0.99

    def test_word_order_matters(self):
        """Position weighting means order changes the vector."""
        encoder = make_encoder()
        forward = encoder.encode_sentence("cat dog")
        backward = encoder.encode_sentence("dog cat")
        similarity = float(np.dot(forward, backward))
        assert similarity > 0.9  # mostly the same words
        assert similarity < 1.0  # but order differs

    def test_punctuation_stripped(self):
        encoder = make_encoder()
        with_punct = encoder.encode_sentence("who wrote hamlet?")
        without_punct = encoder.encode_sentence("who wrote hamlet")
        np.testing.assert_array_almost_equal(with_punct, without_punct, decimal=5)

    def test_tokenization(self):
        encoder = make_encoder()
        assert encoder._tokenize("Who wrote Hamlet?") == ["who", "wrote", "hamlet"]

    def test_tokenization_special_chars(self):
        encoder = make_encoder()
        assert encoder._tokenize("cat's dog-eared, the.") == ["cat", "s", "dog", "eared", "the"]
class TestEncoderNearestWords:
    """Nearest-neighbour lookup over the vocabulary."""

    def test_nearest_to_itself(self):
        """The top match for a word's own vector is that word, sim ≈ 1."""
        encoder = make_encoder()
        query = encoder.encode_word("hamlet")
        hits = encoder.nearest_words(query, k=1)
        assert len(hits) == 1
        word, score = hits[0]
        assert word == "hamlet"
        assert abs(score - 1.0) < 1e-5

    def test_nearest_returns_k(self):
        encoder = make_encoder()
        hits = encoder.nearest_words(encoder.encode_word("cat"), k=3)
        assert len(hits) == 3

    def test_nearest_sorted_by_similarity(self):
        encoder = make_encoder()
        hits = encoder.nearest_words(encoder.encode_word("cat"), k=5)
        scores = [score for _, score in hits]
        assert scores == sorted(scores, reverse=True)

    def test_nearest_zero_vector(self):
        """Zero vector = OOV. No nearest words."""
        encoder = make_encoder()
        assert encoder.nearest_words(np.zeros(DIM, dtype=np.float32)) == []
class TestEncoderVocab:
    """Vocabulary loading and normalization invariants."""

    def test_vocab_size(self):
        assert make_encoder().vocab_size == 10

    def test_load_from_dict(self):
        encoder = Encoder(data_dir="/tmp/test", dim=DIM)
        assert encoder.vocab_size == 0
        encoder.load_from_dict({"hello": np.ones(DIM)})
        assert encoder.vocab_size == 1

    def test_vectors_normalized_on_load(self):
        """All loaded vectors should be unit-length."""
        encoder = Encoder(data_dir="/tmp/test", dim=DIM)
        encoder.load_from_dict({"word": np.ones(DIM) * 5.0})
        assert abs(np.linalg.norm(encoder.encode_word("word")) - 1.0) < 1e-5
if __name__ == "__main__":
    # Allow running this test module directly (python test_encoder.py)
    # without invoking pytest from the command line.
    import pytest
    pytest.main([__file__, "-v"])