guru / tests /test_multimodal.py

Upload folder using huggingface_hub

a5ae1ac verified about 1 month ago

8.12 kB

	"""
	Tests for multimodal encoding and cross-modal retrieval.

	Tests the MultimodalEncoder (CLIP-based) and the engine's
	multimodal teach/query methods.

	CLIP model tests are marked with @pytest.mark.slow — they
	download the model on first run (~400MB).
	"""

	import sys
	import os
	import numpy as np
	import pytest

	sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'src'))


	# --- Unit tests for MultimodalEncoder (no model download) ---

	class TestMultimodalEncoderUnit:
	    """Fast checks for the ``multimodal`` module that should not need a model download."""

	    def test_import_succeeds(self):
	        """The multimodal module should import without errors."""
	        # Importing both names is itself part of the check: a broken module
	        # or a missing export raises ImportError before the assert runs.
	        from multimodal import MultimodalEncoder, CLIP_DIM
	        assert CLIP_DIM == 512

	    def test_lazy_model_loading(self):
	        """Model should not load until first encode call."""
	        from multimodal import MultimodalEncoder
	        # Build the encoder through its real constructor. Assembling the
	        # object by hand with __new__ and then asserting on a value the test
	        # itself assigned would pass no matter what __init__ does.
	        # NOTE(review): assumes __init__ accepts device="cpu" (as used by the
	        # CLIP integration fixture in this file) and leaves _model unset
	        # until the first encode call -- confirm against multimodal.py.
	        enc = MultimodalEncoder(device="cpu")
	        # No encode_* call has been made yet, so nothing should be loaded.
	        assert enc._model is None


	# --- Integration tests with CLIP model ---

	@pytest.mark.slow
	class TestMultimodalCLIP:
	    """Integration tests that run MultimodalEncoder on CPU (marked slow)."""

	    @staticmethod
	    def _solid_png(directory, filename, rgb):
	        """Save a 224x224 single-colour RGB image under ``directory``; return its path as str."""
	        from PIL import Image
	        target = str(directory / filename)
	        Image.new("RGB", (224, 224), color=rgb).save(target)
	        return target

	    @pytest.fixture(scope="class")
	    def encoder(self):
	        """Provide one CPU encoder shared by every test in this class."""
	        from multimodal import MultimodalEncoder
	        return MultimodalEncoder(device="cpu")

	    def test_text_encoding_shape(self, encoder):
	        """A text embedding is a unit-length vector of 512 components."""
	        embedding = encoder.encode_text("a cat sitting on a mat")
	        assert embedding.shape == (512,)
	        length = np.linalg.norm(embedding)
	        assert abs(length - 1.0) < 1e-5

	    def test_text_similarity(self, encoder):
	        """A paraphrase must score higher against the anchor than an unrelated sentence."""
	        anchor, paraphrase, unrelated = (
	            encoder.encode_text(sentence)
	            for sentence in (
	                "a cat sitting on a mat",
	                "a kitten on a rug",
	                "the stock market crashed today",
	            )
	        )
	        score_related = float(np.dot(anchor, paraphrase))
	        score_unrelated = float(np.dot(anchor, unrelated))
	        assert score_related > score_unrelated

	    def test_image_encoding(self, encoder, tmp_path):
	        """An image given by file path encodes to a unit-length 512-vector."""
	        image_path = self._solid_png(tmp_path, "red.png", (255, 0, 0))

	        embedding = encoder.encode_image(image_path)
	        assert embedding.shape == (512,)
	        length = np.linalg.norm(embedding)
	        assert abs(length - 1.0) < 1e-5

	    def test_cross_modal_similarity(self, encoder, tmp_path):
	        """A "red" caption should not score meaningfully lower on a red image than on a blue one."""
	        red_path = self._solid_png(tmp_path, "red.png", (255, 0, 0))
	        blue_path = self._solid_png(tmp_path, "blue.png", (0, 0, 255))

	        caption_vec = encoder.encode_text("a solid red image")
	        red_vec = encoder.encode_image(red_path)
	        blue_vec = encoder.encode_image(blue_path)

	        score_match = float(np.dot(caption_vec, red_vec))
	        score_mismatch = float(np.dot(caption_vec, blue_vec))
	        # Passes when the red image wins, and also tolerates the blue image
	        # winning by less than 0.1 (solid colour swatches are a weak signal).
	        assert score_mismatch - score_match < 0.1

	    def test_batch_text_encoding(self, encoder):
	        """Batch output has one 512-row per text; row 0 matches the single-text encoding."""
	        texts = ["hello world", "cat on mat", "quantum physics"]
	        stacked = encoder.encode_batch_text(texts)
	        assert stacked.shape == (3, 512)

	        # Only a near-match is required: batching may introduce small
	        # floating-point differences.
	        single = encoder.encode_text(texts[0])
	        agreement = float(np.dot(stacked[0], single))
	        assert agreement > 0.99

	    def test_pil_image_direct(self, encoder):
	        """A PIL Image object is accepted in place of a file path."""
	        from PIL import Image
	        green = Image.new("RGB", (100, 100), color=(0, 255, 0))
	        assert encoder.encode_image(green).shape == (512,)

	    def test_numpy_image(self, encoder):
	        """A uint8 array of shape (100, 100, 3) is accepted as an image."""
	        pixels = np.zeros((100, 100, 3), dtype=np.uint8)
	        pixels[..., 1] = 255  # fill the green channel
	        assert encoder.encode_image(pixels).shape == (512,)


	# --- Engine multimodal integration tests ---

	@pytest.mark.slow
	class TestEngineMultimodal:
	    """Engine-level tests for teaching and querying images with a CLIP-dimension engine."""

	    @pytest.fixture
	    def engine(self, tmp_path):
	        """Yield an Engine whose data directory is ``tmp_path``; close it on teardown."""
	        from engine import Engine
	        from multimodal import CLIP_DIM
	        # Use CLIP dim (512) for multimodal engine
	        engine = Engine(data_dir=str(tmp_path), dim=CLIP_DIM)
	        yield engine
	        # Teardown runs even when the test body raised, so a failed assertion
	        # no longer leaves the engine open.
	        engine.close()

	    def test_teach_image(self, engine, tmp_path):
	        """Teaching an image should create a neuron."""
	        from PIL import Image
	        img = Image.new("RGB", (224, 224), color=(255, 0, 0))
	        path = str(tmp_path / "test.png")
	        img.save(path)

	        before = engine.db.count()
	        neuron = engine.teach_image(path, label="a red square")
	        after = engine.db.count()

	        assert after > before
	        assert neuron.vector.shape == (512,)

	    def test_teach_image_with_label_creates_link(self, engine, tmp_path):
	        """Teaching with a label should create two neurons linked by successor."""
	        from PIL import Image
	        img = Image.new("RGB", (224, 224), color=(0, 0, 255))
	        path = str(tmp_path / "blue.png")
	        img.save(path)

	        neuron = engine.teach_image(path, label="a blue square")
	        # Should have created 2 neurons (image + label)
	        assert engine.db.count() >= 2

	        # Re-read the returned neuron from the db: it should have a
	        # successor (the label).
	        refreshed = engine.db.get(neuron.id)
	        assert len(refreshed.successors) >= 1

	    def test_query_image(self, engine, tmp_path):
	        """Querying with an image should find similar neurons."""
	        from PIL import Image

	        # Teach a red image
	        red = Image.new("RGB", (224, 224), color=(255, 0, 0))
	        red_path = str(tmp_path / "red.png")
	        red.save(red_path)
	        engine.teach_image(red_path, label="red color")

	        # Query with the same image
	        results = engine.query_image(red_path, k=3)
	        assert len(results) >= 1
	        # First result should be highly similar (it's the same image);
	        # index 1 of each result is compared as the similarity score.
	        assert results[0][1] > 0.9

	    def test_text_to_image_retrieval(self, engine, tmp_path):
	        """Text query should find taught images via cross-modal search."""
	        from PIL import Image

	        # Teach images with labels
	        red = Image.new("RGB", (224, 224), color=(255, 0, 0))
	        red_path = str(tmp_path / "red.png")
	        red.save(red_path)
	        engine.teach_image(red_path, label="red")

	        blue = Image.new("RGB", (224, 224), color=(0, 0, 255))
	        blue_path = str(tmp_path / "blue.png")
	        blue.save(blue_path)
	        engine.teach_image(blue_path, label="blue")

	        # Text query for "red" should find red-related neurons
	        results = engine.query_text_to_image("red color", k=5)
	        assert len(results) >= 1

	    def test_multimodal_convergence(self, engine, tmp_path):
	        """The convergence loop should work with CLIP vectors."""
	        from PIL import Image

	        # Teach some images
	        swatches = [
	            ("red", (255, 0, 0)),
	            ("blue", (0, 0, 255)),
	            ("green", (0, 255, 0)),
	        ]
	        for color_name, color in swatches:
	            img = Image.new("RGB", (224, 224), color=color)
	            path = str(tmp_path / f"{color_name}.png")
	            img.save(path)
	            engine.teach_image(path, label=color_name)

	        # Convergence should work on CLIP vectors
	        vec = engine.multimodal.encode_text("colors")
	        result = engine.convergence.converge(vec)
	        # Should find some neurons (may or may not converge at small scale)
	        assert result is not None