| """ |
| Tests for multimodal encoding and cross-modal retrieval. |
| |
| Tests the MultimodalEncoder (CLIP-based) and the engine's |
| multimodal teach/query methods. |
| |
| CLIP model tests are marked with @pytest.mark.slow — they |
| download the model on first run (~400MB). |
| """ |
|
|
| import sys |
| import os |
| import numpy as np |
| import pytest |
|
|
| sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'src')) |
|
|
|
|
| |
|
|
class TestMultimodalEncoderUnit:
    """Lightweight checks on the ``multimodal`` module that never call an encode method."""

    def test_import_succeeds(self):
        """Importing the encoder class and the dimension constant must not raise."""
        # MultimodalEncoder is imported only to prove the name is importable.
        from multimodal import CLIP_DIM, MultimodalEncoder  # noqa: F401

        expected_dim = 512
        assert CLIP_DIM == expected_dim

    def test_lazy_model_loading(self):
        """A bare encoder instance should hold no model before any encode call."""
        from multimodal import MultimodalEncoder

        # Allocate the object without running __init__, then populate the
        # attributes by hand, in the same order a caller would expect them.
        bare = MultimodalEncoder.__new__(MultimodalEncoder)
        manual_state = (
            ("_model", None),
            ("model_name", "clip-ViT-B-32"),
            ("device", "cpu"),
            ("dim", 512),
        )
        for attr, value in manual_state:
            setattr(bare, attr, value)

        # NOTE(review): this checks only the state assigned just above; neither
        # __init__ nor any encode method is executed by this test.
        assert bare._model is None
|
|
|
|
| |
|
|
@pytest.mark.slow
class TestMultimodalCLIP:
    """Slow tests that drive a real ``MultimodalEncoder`` on CPU.

    Covers text encoding, image encoding from a file path, a PIL image and a
    numpy array, batch text encoding, and text/image similarity ordering.
    """

    @pytest.fixture(scope="class")
    def encoder(self):
        """Build one CPU encoder and share it across every test in this class."""
        from multimodal import MultimodalEncoder
        return MultimodalEncoder(device="cpu")

    def test_text_encoding_shape(self, encoder):
        """Text encoding should produce a 512-dim normalized vector."""
        vec = encoder.encode_text("a cat sitting on a mat")
        assert vec.shape == (512,)
        assert abs(np.linalg.norm(vec) - 1.0) < 1e-5

    def test_text_similarity(self, encoder):
        """Similar texts should produce similar vectors."""
        v1 = encoder.encode_text("a cat sitting on a mat")
        v2 = encoder.encode_text("a kitten on a rug")
        v3 = encoder.encode_text("the stock market crashed today")
        sim_close = float(np.dot(v1, v2))
        sim_far = float(np.dot(v1, v3))
        assert sim_close > sim_far

    def test_image_encoding(self, encoder, tmp_path):
        """Image encoding should produce a 512-dim normalized vector."""
        from PIL import Image

        # Encode from a file path, so write a solid red image to disk first.
        img = Image.new("RGB", (224, 224), color=(255, 0, 0))
        path = str(tmp_path / "red.png")
        img.save(path)

        vec = encoder.encode_image(path)
        assert vec.shape == (512,)
        assert abs(np.linalg.norm(vec) - 1.0) < 1e-5

    def test_cross_modal_similarity(self, encoder, tmp_path):
        """Text and images of similar content should be close in vector space."""
        from PIL import Image

        red_img = Image.new("RGB", (224, 224), color=(255, 0, 0))
        blue_img = Image.new("RGB", (224, 224), color=(0, 0, 255))
        red_path = str(tmp_path / "red.png")
        blue_path = str(tmp_path / "blue.png")
        red_img.save(red_path)
        blue_img.save(blue_path)

        red_text = encoder.encode_text("a solid red image")
        red_image = encoder.encode_image(red_path)
        blue_image = encoder.encode_image(blue_path)

        sim_match = float(np.dot(red_text, red_image))
        sim_mismatch = float(np.dot(red_text, blue_image))

        # Deliberately lenient: passes when the matching pair scores higher,
        # or when the two scores are within 0.1 of each other.
        assert sim_match > sim_mismatch or abs(sim_match - sim_mismatch) < 0.1

    def test_batch_text_encoding(self, encoder):
        """Batch encoding should produce same results as individual."""
        texts = ["hello world", "cat on mat", "quantum physics"]
        batch = encoder.encode_batch_text(texts)
        assert batch.shape == (len(texts), 512)

        # Compare every row, not just the first, against its single-text
        # encoding so a mismatch in any position of the batch is caught.
        for row, text in zip(batch, texts):
            individual = encoder.encode_text(text)
            sim = float(np.dot(row, individual))
            assert sim > 0.99, f"batch/individual mismatch for {text!r}: {sim:.4f}"

    def test_pil_image_direct(self, encoder):
        """Should accept PIL Image objects directly."""
        from PIL import Image
        img = Image.new("RGB", (100, 100), color=(0, 255, 0))
        vec = encoder.encode_image(img)
        assert vec.shape == (512,)

    def test_numpy_image(self, encoder):
        """Should accept numpy arrays."""
        # 100x100 uint8 RGB array with only the green channel set.
        arr = np.zeros((100, 100, 3), dtype=np.uint8)
        arr[:, :, 1] = 255
        vec = encoder.encode_image(arr)
        assert vec.shape == (512,)
|
|
|
|
| |
|
|
@pytest.mark.slow
class TestEngineMultimodal:
    """Slow tests for the engine's image teach/query and text-to-image methods.

    Each test receives a fresh ``Engine`` rooted in its own ``tmp_path``; the
    ``engine`` fixture closes it during teardown, so the engine is released
    even when an assertion in the test body fails.
    """

    @pytest.fixture
    def engine(self, tmp_path):
        """Yield an Engine sized for CLIP vectors and close it after the test."""
        from engine import Engine
        from multimodal import CLIP_DIM

        engine = Engine(data_dir=str(tmp_path), dim=CLIP_DIM)
        yield engine
        # Teardown runs whether the test passed or failed.
        engine.close()

    @staticmethod
    def _save_solid_png(directory, name, color):
        """Write a 224x224 single-colour PNG named ``<name>.png`` and return its path as str."""
        from PIL import Image

        path = str(directory / f"{name}.png")
        Image.new("RGB", (224, 224), color=color).save(path)
        return path

    def test_teach_image(self, engine, tmp_path):
        """Teaching an image should create a neuron."""
        path = self._save_solid_png(tmp_path, "test", (255, 0, 0))

        before = engine.db.count()
        neuron = engine.teach_image(path, label="a red square")
        after = engine.db.count()

        assert after > before
        assert neuron.vector.shape == (512,)

    def test_teach_image_with_label_creates_link(self, engine, tmp_path):
        """Teaching with a label should create two neurons linked by successor."""
        path = self._save_solid_png(tmp_path, "blue", (0, 0, 255))

        neuron = engine.teach_image(path, label="a blue square")

        assert engine.db.count() >= 2

        # Re-read the neuron from the store before inspecting its successors.
        refreshed = engine.db.get(neuron.id)
        assert len(refreshed.successors) >= 1

    def test_query_image(self, engine, tmp_path):
        """Querying with an image should find similar neurons."""
        red_path = self._save_solid_png(tmp_path, "red", (255, 0, 0))
        engine.teach_image(red_path, label="red color")

        # Query with the very image that was taught.
        results = engine.query_image(red_path, k=3)
        assert len(results) >= 1

        # Index 1 of the top result is compared against a 0.9 threshold.
        assert results[0][1] > 0.9

    def test_text_to_image_retrieval(self, engine, tmp_path):
        """Text query should find taught images via cross-modal search."""
        red_path = self._save_solid_png(tmp_path, "red", (255, 0, 0))
        engine.teach_image(red_path, label="red")

        blue_path = self._save_solid_png(tmp_path, "blue", (0, 0, 255))
        engine.teach_image(blue_path, label="blue")

        results = engine.query_text_to_image("red color", k=5)
        assert len(results) >= 1

    def test_multimodal_convergence(self, engine, tmp_path):
        """The convergence loop should work with CLIP vectors."""
        palette = [
            ("red", (255, 0, 0)),
            ("blue", (0, 0, 255)),
            ("green", (0, 255, 0)),
        ]
        for color_name, color in palette:
            path = self._save_solid_png(tmp_path, color_name, color)
            engine.teach_image(path, label=color_name)

        # Feed a text-derived vector into the convergence loop.
        vec = engine.multimodal.encode_text("colors")
        result = engine.convergence.converge(vec)

        assert result is not None
|
|