"""
Tests for multimodal encoding and cross-modal retrieval.

Tests the MultimodalEncoder (CLIP-based) and the engine's
multimodal teach/query methods.

CLIP model tests are marked with @pytest.mark.slow — they
download the model on first run (~400MB).
"""

import sys
import os
import numpy as np
import pytest

# Put the sibling ``src`` directory (one level above this file's directory)
# at the front of the import path so the tests below can import from it.
_SRC_DIR = os.path.join(os.path.dirname(__file__), os.pardir, "src")
sys.path.insert(0, _SRC_DIR)


# --- Unit tests for MultimodalEncoder (no model download) ---

class TestMultimodalEncoderUnit:
    """Fast checks on the multimodal module that never call an encode method."""

    def test_import_succeeds(self):
        """The multimodal module should import without errors."""
        # Importing MultimodalEncoder is part of the check even though the
        # name is not used afterwards.
        from multimodal import MultimodalEncoder, CLIP_DIM
        assert CLIP_DIM == 512

    def test_lazy_model_loading(self):
        """Model should not load until first encode call."""
        from multimodal import MultimodalEncoder
        # Go through the real constructor. Bypassing ``__init__`` and
        # assigning ``_model = None`` by hand before asserting it is None
        # would pass no matter what the constructor actually does.
        enc = MultimodalEncoder(device="cpu")
        # No encode method has been called yet, so nothing should be loaded.
        assert enc._model is None


# --- Integration tests with CLIP model ---

@pytest.mark.slow
class TestMultimodalCLIP:
    """Integration tests that run the real MultimodalEncoder on CPU."""

    @pytest.fixture(scope="class")
    def encoder(self):
        """Provide one MultimodalEncoder shared by every test in this class."""
        from multimodal import MultimodalEncoder
        return MultimodalEncoder(device="cpu")

    def test_text_encoding_shape(self, encoder):
        """Text encoding should produce a 512-dim normalized vector."""
        vec = encoder.encode_text("a cat sitting on a mat")
        assert vec.shape == (512,)
        assert abs(np.linalg.norm(vec) - 1.0) < 1e-5

    def test_text_similarity(self, encoder):
        """Similar texts should produce similar vectors."""
        v1 = encoder.encode_text("a cat sitting on a mat")
        v2 = encoder.encode_text("a kitten on a rug")
        v3 = encoder.encode_text("the stock market crashed today")
        sim_close = float(np.dot(v1, v2))
        sim_far = float(np.dot(v1, v3))
        assert sim_close > sim_far

    def test_image_encoding(self, encoder, tmp_path):
        """Image encoding should produce a 512-dim normalized vector."""
        # Create a simple test image
        from PIL import Image
        img = Image.new("RGB", (224, 224), color=(255, 0, 0))
        path = str(tmp_path / "red.png")
        img.save(path)

        vec = encoder.encode_image(path)
        assert vec.shape == (512,)
        assert abs(np.linalg.norm(vec) - 1.0) < 1e-5

    def test_cross_modal_similarity(self, encoder, tmp_path):
        """Text and images of similar content should be close in vector space."""
        from PIL import Image

        # Create a red image and a blue image
        red_img = Image.new("RGB", (224, 224), color=(255, 0, 0))
        blue_img = Image.new("RGB", (224, 224), color=(0, 0, 255))
        red_path = str(tmp_path / "red.png")
        blue_path = str(tmp_path / "blue.png")
        red_img.save(red_path)
        blue_img.save(blue_path)

        # "red" text should be closer to red image than blue image
        red_text = encoder.encode_text("a solid red image")
        red_image = encoder.encode_image(red_path)
        blue_image = encoder.encode_image(blue_path)

        sim_match = float(np.dot(red_text, red_image))
        sim_mismatch = float(np.dot(red_text, blue_image))
        # Lenient check: passes when the red image scores higher, and also
        # when the two similarities differ by less than 0.1 either way.
        assert sim_match > sim_mismatch or abs(sim_match - sim_mismatch) < 0.1

    def test_batch_text_encoding(self, encoder):
        """Batch encoding should produce same results as individual."""
        texts = ["hello world", "cat on mat", "quantum physics"]
        batch = encoder.encode_batch_text(texts)
        assert batch.shape == (3, 512)

        # Compare every row, not just the first, against the vector obtained
        # by encoding that text on its own. Should be very close (floating
        # point differences from batching).
        for row, text in zip(batch, texts):
            individual = encoder.encode_text(text)
            sim = float(np.dot(row, individual))
            assert sim > 0.99, f"batch/individual mismatch for {text!r}: {sim:.4f}"

    def test_pil_image_direct(self, encoder):
        """Should accept PIL Image objects directly."""
        from PIL import Image
        img = Image.new("RGB", (100, 100), color=(0, 255, 0))
        vec = encoder.encode_image(img)
        assert vec.shape == (512,)

    def test_numpy_image(self, encoder):
        """Should accept numpy arrays."""
        arr = np.zeros((100, 100, 3), dtype=np.uint8)
        arr[:, :, 1] = 255  # green
        vec = encoder.encode_image(arr)
        assert vec.shape == (512,)


# --- Engine multimodal integration tests ---

@pytest.mark.slow
class TestEngineMultimodal:
    """Integration tests for the engine's image teach/query methods."""

    @pytest.fixture
    def engine(self, tmp_path):
        """Yield an Engine sized for CLIP vectors and close it on teardown.

        Closing here, rather than as the last statement of each test, means
        the engine is still closed when an assertion fails part-way through
        a test.
        """
        from engine import Engine
        from multimodal import CLIP_DIM
        # Use CLIP dim (512) for multimodal engine
        engine = Engine(data_dir=str(tmp_path), dim=CLIP_DIM)
        yield engine
        engine.close()

    def test_teach_image(self, engine, tmp_path):
        """Teaching an image should create a neuron."""
        from PIL import Image
        img = Image.new("RGB", (224, 224), color=(255, 0, 0))
        path = str(tmp_path / "test.png")
        img.save(path)

        before = engine.db.count()
        neuron = engine.teach_image(path, label="a red square")
        after = engine.db.count()

        assert after > before
        assert neuron.vector.shape == (512,)

    def test_teach_image_with_label_creates_link(self, engine, tmp_path):
        """Teaching with a label should create two neurons linked by successor."""
        from PIL import Image
        img = Image.new("RGB", (224, 224), color=(0, 0, 255))
        path = str(tmp_path / "blue.png")
        img.save(path)

        neuron = engine.teach_image(path, label="a blue square")
        # Should have created 2 neurons (image + label)
        assert engine.db.count() >= 2

        # Image neuron should have successor (the label); re-read it from the
        # db rather than trusting the object returned by teach_image.
        refreshed = engine.db.get(neuron.id)
        assert len(refreshed.successors) >= 1

    def test_query_image(self, engine, tmp_path):
        """Querying with an image should find similar neurons."""
        from PIL import Image

        # Teach a red image
        red = Image.new("RGB", (224, 224), color=(255, 0, 0))
        red_path = str(tmp_path / "red.png")
        red.save(red_path)
        engine.teach_image(red_path, label="red color")

        # Query with the same image
        results = engine.query_image(red_path, k=3)
        assert len(results) >= 1
        # First result should be highly similar (it's the same image)
        assert results[0][1] > 0.9

    def test_text_to_image_retrieval(self, engine, tmp_path):
        """Text query should find taught images via cross-modal search."""
        from PIL import Image

        # Teach images with labels
        red = Image.new("RGB", (224, 224), color=(255, 0, 0))
        red_path = str(tmp_path / "red.png")
        red.save(red_path)
        engine.teach_image(red_path, label="red")

        blue = Image.new("RGB", (224, 224), color=(0, 0, 255))
        blue_path = str(tmp_path / "blue.png")
        blue.save(blue_path)
        engine.teach_image(blue_path, label="blue")

        # Text query for "red" should find red-related neurons
        results = engine.query_text_to_image("red color", k=5)
        assert len(results) >= 1

    def test_multimodal_convergence(self, engine, tmp_path):
        """The convergence loop should work with CLIP vectors."""
        from PIL import Image

        # Teach some images
        for color_name, color in [("red", (255, 0, 0)), ("blue", (0, 0, 255)),
                                   ("green", (0, 255, 0))]:
            img = Image.new("RGB", (224, 224), color=color)
            path = str(tmp_path / f"{color_name}.png")
            img.save(path)
            engine.teach_image(path, label=color_name)

        # Convergence should work on CLIP vectors
        vec = engine.multimodal.encode_text("colors")
        result = engine.convergence.converge(vec)
        # Should find some neurons (may or may not converge at small scale)
        assert result is not None