guru / tests /test_multimodal.py
tejadabheja's picture
Upload folder using huggingface_hub
a5ae1ac verified
"""
Tests for multimodal encoding and cross-modal retrieval.
Tests the MultimodalEncoder (CLIP-based) and the engine's
multimodal teach/query methods.
CLIP model tests are marked with @pytest.mark.slow — they
download the model on first run (~400MB).
"""
import sys
import os
import numpy as np
import pytest
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'src'))
# --- Unit tests for MultimodalEncoder (no model download) ---
class TestMultimodalEncoderUnit:
    """Lightweight checks on the multimodal module (no model download)."""

    def test_import_succeeds(self):
        """Importing the encoder and its dimension constant must not raise."""
        # MultimodalEncoder is imported only to prove the name is importable;
        # the assertion below concerns the constant alone.
        from multimodal import MultimodalEncoder, CLIP_DIM

        expected_dim = 512
        assert CLIP_DIM == expected_dim

    def test_lazy_model_loading(self):
        """A hand-built encoder instance holds no model before any encode call."""
        from multimodal import MultimodalEncoder

        # Allocate the instance without running __init__, then populate the
        # attributes manually so nothing here can trigger a model load.
        bare = MultimodalEncoder.__new__(MultimodalEncoder)
        manual_state = (
            ("_model", None),
            ("model_name", "clip-ViT-B-32"),
            ("device", "cpu"),
            ("dim", 512),
        )
        for attr_name, attr_value in manual_state:
            setattr(bare, attr_name, attr_value)

        # NOTE(review): this only re-reads the value assigned just above, so it
        # cannot detect an eager load in __init__ -- consider constructing the
        # encoder normally once the constructor is confirmed to be lazy.
        assert bare._model is None
# --- Integration tests with CLIP model ---
@pytest.mark.slow
class TestMultimodalCLIP:
    """Integration tests that exercise a real MultimodalEncoder on CPU.

    Marked slow: the CLIP model is downloaded on first run (~400MB).
    """

    @pytest.fixture(scope="class")
    def encoder(self):
        """Build one CPU encoder and share it across every test in the class."""
        from multimodal import MultimodalEncoder

        return MultimodalEncoder(device="cpu")

    def test_text_encoding_shape(self, encoder):
        """Text encoding should produce a 512-dim normalized vector."""
        vec = encoder.encode_text("a cat sitting on a mat")
        assert vec.shape == (512,)
        # Unit length, within float tolerance.
        assert abs(np.linalg.norm(vec) - 1.0) < 1e-5

    def test_text_similarity(self, encoder):
        """Similar texts should produce similar vectors."""
        v1 = encoder.encode_text("a cat sitting on a mat")
        v2 = encoder.encode_text("a kitten on a rug")
        v3 = encoder.encode_text("the stock market crashed today")
        sim_close = float(np.dot(v1, v2))
        sim_far = float(np.dot(v1, v3))
        assert sim_close > sim_far

    def test_image_encoding(self, encoder, tmp_path):
        """Image encoding from a file path should produce a 512-dim normalized vector."""
        from PIL import Image

        # Create a simple test image on disk and encode it by path.
        img = Image.new("RGB", (224, 224), color=(255, 0, 0))
        path = str(tmp_path / "red.png")
        img.save(path)
        vec = encoder.encode_image(path)
        assert vec.shape == (512,)
        assert abs(np.linalg.norm(vec) - 1.0) < 1e-5

    def test_cross_modal_similarity(self, encoder, tmp_path):
        """Text and images of similar content should be close in vector space."""
        from PIL import Image

        # Create a red image and a blue image.
        red_img = Image.new("RGB", (224, 224), color=(255, 0, 0))
        blue_img = Image.new("RGB", (224, 224), color=(0, 0, 255))
        red_path = str(tmp_path / "red.png")
        blue_path = str(tmp_path / "blue.png")
        red_img.save(red_path)
        blue_img.save(blue_path)

        # "red" text should be closer to the red image than to the blue image.
        red_text = encoder.encode_text("a solid red image")
        red_image = encoder.encode_image(red_path)
        blue_image = encoder.encode_image(blue_path)
        sim_match = float(np.dot(red_text, red_image))
        sim_mismatch = float(np.dot(red_text, blue_image))

        # Deliberately lenient: the test passes when the matching pair scores
        # higher, or when the two scores are within 0.1 of each other.
        assert sim_match > sim_mismatch or abs(sim_match - sim_mismatch) < 0.1

    def test_batch_text_encoding(self, encoder):
        """Batch encoding should produce the same results as individual encoding."""
        texts = ["hello world", "cat on mat", "quantum physics"]
        batch = encoder.encode_batch_text(texts)
        assert batch.shape == (len(texts), 512)

        # Compare every row, not just the first, against its single-text
        # encoding. Should be very close (floating point differences from
        # batching), hence the 0.99 threshold rather than exact equality.
        for index, text in enumerate(texts):
            individual = encoder.encode_text(text)
            sim = float(np.dot(batch[index], individual))
            assert sim > 0.99, f"batch row {index} diverges for {text!r}"

    def test_pil_image_direct(self, encoder):
        """Should accept PIL Image objects directly."""
        from PIL import Image

        img = Image.new("RGB", (100, 100), color=(0, 255, 0))
        vec = encoder.encode_image(img)
        assert vec.shape == (512,)

    def test_numpy_image(self, encoder):
        """Should accept numpy arrays."""
        arr = np.zeros((100, 100, 3), dtype=np.uint8)
        arr[:, :, 1] = 255  # green
        vec = encoder.encode_image(arr)
        assert vec.shape == (512,)
# --- Engine multimodal integration tests ---
@pytest.mark.slow
class TestEngineMultimodal:
    """Integration tests for the engine's image teach/query methods.

    Each test gets a fresh Engine rooted in its own temporary directory; the
    ``engine`` fixture closes it afterwards, even when an assertion fails.
    """

    @pytest.fixture
    def engine(self, tmp_path):
        """Yield an Engine sized for CLIP vectors and close it on teardown."""
        from engine import Engine
        from multimodal import CLIP_DIM

        # Use CLIP dim (512) for multimodal engine.
        eng = Engine(data_dir=str(tmp_path), dim=CLIP_DIM)
        yield eng
        # Teardown runs whether the test passed or failed, so the engine is
        # never left open by an early assertion failure.
        eng.close()

    @staticmethod
    def _solid_png(directory, filename, rgb):
        """Save a 224x224 single-colour PNG in *directory* and return its path as str."""
        from PIL import Image

        path = str(directory / filename)
        Image.new("RGB", (224, 224), color=rgb).save(path)
        return path

    def test_teach_image(self, engine, tmp_path):
        """Teaching an image should create a neuron."""
        path = self._solid_png(tmp_path, "test.png", (255, 0, 0))
        before = engine.db.count()
        neuron = engine.teach_image(path, label="a red square")
        after = engine.db.count()
        assert after > before
        assert neuron.vector.shape == (512,)

    def test_teach_image_with_label_creates_link(self, engine, tmp_path):
        """Teaching with a label should create two neurons linked by successor."""
        path = self._solid_png(tmp_path, "blue.png", (0, 0, 255))
        neuron = engine.teach_image(path, label="a blue square")
        # Should have created 2 neurons (image + label).
        assert engine.db.count() >= 2
        # Re-read from the db: the image neuron should have a successor (the label).
        refreshed = engine.db.get(neuron.id)
        assert len(refreshed.successors) >= 1

    def test_query_image(self, engine, tmp_path):
        """Querying with an image should find similar neurons."""
        # Teach a red image.
        red_path = self._solid_png(tmp_path, "red.png", (255, 0, 0))
        engine.teach_image(red_path, label="red color")
        # Query with the same image.
        results = engine.query_image(red_path, k=3)
        assert len(results) >= 1
        # First result should be highly similar (it's the same image).
        assert results[0][1] > 0.9

    def test_text_to_image_retrieval(self, engine, tmp_path):
        """Text query should find taught images via cross-modal search."""
        # Teach images with labels.
        red_path = self._solid_png(tmp_path, "red.png", (255, 0, 0))
        engine.teach_image(red_path, label="red")
        blue_path = self._solid_png(tmp_path, "blue.png", (0, 0, 255))
        engine.teach_image(blue_path, label="blue")
        # Text query for "red" should find red-related neurons.
        results = engine.query_text_to_image("red color", k=5)
        assert len(results) >= 1

    def test_multimodal_convergence(self, engine, tmp_path):
        """The convergence loop should work with CLIP vectors."""
        # Teach some images.
        palette = [
            ("red", (255, 0, 0)),
            ("blue", (0, 0, 255)),
            ("green", (0, 255, 0)),
        ]
        for color_name, color in palette:
            path = self._solid_png(tmp_path, f"{color_name}.png", color)
            engine.teach_image(path, label=color_name)
        # Convergence should work on CLIP vectors.
        vec = engine.multimodal.encode_text("colors")
        result = engine.convergence.converge(vec)
        # Should find some neurons (may or may not converge at small scale).
        assert result is not None