File size: 10,743 Bytes
a9c06ad
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
# backend/tests/integration/test_raptor.py
# Integration tests for RAPTOR hierarchical summarisation.
#
# Task 8: Validates that the RAPTOR builder produces coherent hierarchies
# with proper clustering, summarisation, and embedding integration.
#
# Tests run with synthetic corpus fixtures to avoid dependency on real
# knowledge base content.

import os
import sys
import pytest
import numpy as np
from unittest.mock import AsyncMock, MagicMock, patch
# Add the directory three levels above this file to sys.path so the ingestion package is importable
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '../../..'))


from ingestion.raptor import RaptorBuilder, _n_clusters, _gmm_soft_assign


class TestRaptorClustering:
    """Checks for the helper functions behind RAPTOR clustering."""

    def test_n_clusters_formula(self):
        """Cluster count follows the sqrt(N) heuristic, clamped at both ends."""
        # corpus size -> expected cluster count (evaluated in this order)
        expected_by_size = {
            4: 2,
            100: 10,
            400: 20,
            500: 20,
            1: 2,
        }
        for corpus_size, expected in expected_by_size.items():
            assert _n_clusters(corpus_size) == expected

    def test_gmm_soft_assign_shape(self):
        """Labels and responsibilities come back with the expected shapes."""
        n_points, n_components = 20, 3
        vectors = np.random.default_rng(seed=42).standard_normal((n_points, 384))

        labels, responsibilities = _gmm_soft_assign(
            vectors, n_components=n_components
        )

        assert labels.shape == (n_points,)
        assert responsibilities.shape == (n_points, n_components)

        # Every hard label must index one of the fitted components.
        label_in_range = (labels >= 0) & (labels < n_components)
        assert np.all(label_in_range)

        # Each row of soft assignments must sum to one.
        row_totals = responsibilities.sum(axis=1)
        assert np.allclose(row_totals, 1.0)

    def test_gmm_cluster_determinism(self):
        """Two fits with the same random_state yield identical labels."""
        vectors = np.random.default_rng(seed=42).standard_normal((15, 384))

        first_labels, _ = _gmm_soft_assign(
            vectors, n_components=2, random_state=42
        )
        second_labels, _ = _gmm_soft_assign(
            vectors, n_components=2, random_state=42
        )

        np.testing.assert_array_equal(first_labels, second_labels)


class TestRaptorSummarisation:
    """Integration tests for RAPTOR cluster summarisation.

    The vector store, embedder and Gemini client are all mocks, so these
    tests drive ``RaptorBuilder`` without touching any external service.
    """

    @pytest.fixture
    def synthetic_chunks(self):
        """10-item fixture: 5 project chunks + 5 blog chunks."""
        return [
            {
                "id": f"chunk_{i}",
                "text": f"Project {i}: Built a Python async service using FastAPI and PostgreSQL. "
                        f"Key features include real-time validation, caching layers, and REST API.",
                "metadata": {
                    "doc_id": f"project_{i % 3}",
                    "source_title": f"Project {i % 3}",
                    "source_type": "project",
                    "chunk_index": i,
                },
            }
            for i in range(5)
        ] + [
            {
                "id": f"blog_{i}",
                "text": f"Blog Post {i}: Exploring RAG systems with LangGraph, semantic caching, "
                        f"and multi-modal retrieval. Discusses production challenges and solutions.",
                "metadata": {
                    "doc_id": f"blog_{i}",
                    "source_title": f"Blog {i}",
                    "source_type": "blog",
                    "chunk_index": i,
                },
            }
            for i in range(5)
        ]

    @pytest.fixture
    def synthetic_embeddings(self):
        """10 random 384-dim vectors (BGE-small dimension)."""
        rng = np.random.default_rng(seed=42)
        return rng.standard_normal((10, 384)).astype(np.float32)

    def test_raptor_builder_initialization(self):
        """RaptorBuilder instantiates without errors."""
        mock_vector_store = MagicMock()
        mock_embedder = MagicMock()
        mock_gemini = MagicMock()

        builder = RaptorBuilder(
            store=mock_vector_store,
            embedder=mock_embedder,
            gemini_client=mock_gemini,
        )

        assert builder._store is mock_vector_store

    @pytest.mark.asyncio
    async def test_raptor_build_creates_hierarchy(
        self,
        synthetic_chunks,
        synthetic_embeddings,
    ):
        """
        RAPTOR build upserts at least one summary node.

        Only the presence of nodes tagged ``chunk_type == "raptor_summary"``
        in the upserted batches is asserted. Cluster counts and cluster
        sizes are not inspected by this test.
        """
        mock_vector_store = MagicMock()
        mock_embedder = MagicMock()
        mock_gemini = MagicMock()

        def mock_summarise(text: str):
            return "Summary of cluster content"

        mock_gemini.summarise = AsyncMock(side_effect=mock_summarise)

        # Mock embedder to return synthetic vectors
        def mock_embed(texts, is_query=False):
            rng = np.random.default_rng(seed=42)
            return rng.standard_normal((len(texts), 384)).astype(np.float32)

        mock_embedder.embed = AsyncMock(side_effect=mock_embed)
        mock_embedder.embed_texts_async = mock_embedder.embed

        # Running total of summary nodes seen across *all* upsert batches.
        summary_node_count = 0

        def capture_upsert(nodes, dense_embeddings, sparse_embeddings=None):
            nonlocal summary_node_count
            # Detect raptor_summary nodes by inspecting their metadata.
            summary_node_count += sum(
                1
                for n in nodes
                if n.get("metadata", {}).get("chunk_type") == "raptor_summary"
            )
            return [f"uuid_{i}" for i in range(len(nodes))]

        mock_vector_store.upsert_chunks = MagicMock(side_effect=capture_upsert)

        builder = RaptorBuilder(
            store=mock_vector_store,
            embedder=mock_embedder,
            gemini_client=mock_gemini,
        )

        leaf_uuids = [f"uuid_chunk_{i}" for i in range(len(synthetic_chunks))]

        await builder.build(
            leaf_chunks=synthetic_chunks,
            dense_embeddings=synthetic_embeddings.tolist(),
            leaf_uuids=leaf_uuids,
        )

        # The fixture always supplies 10 chunks, so a summary is always expected.
        assert summary_node_count > 0, (
            "expected at least one raptor_summary node to be upserted"
        )

    @pytest.mark.asyncio
    async def test_raptor_child_leaf_mapping(self, synthetic_chunks, synthetic_embeddings):
        """Child leaf IDs on summary nodes reference the supplied leaf UUIDs."""
        mock_vector_store = MagicMock()
        mock_embedder = MagicMock()
        mock_gemini = MagicMock()

        def mock_summarise(text: str):
            return "Cluster summary"

        mock_gemini.summarise = AsyncMock(side_effect=mock_summarise)

        def mock_embed(texts, is_query=False):
            rng = np.random.default_rng(seed=43)
            return rng.standard_normal((len(texts), 384)).astype(np.float32)

        mock_embedder.embed = AsyncMock(side_effect=mock_embed)
        mock_embedder.embed_texts_async = mock_embedder.embed

        # One entry per upserted summary node: that node's child_leaf_ids list.
        captured_mappings = []

        def capture_upsert(nodes, dense_embeddings, sparse_embeddings=None):
            for node in nodes:
                if node.get("metadata", {}).get("chunk_type") == "raptor_summary":
                    child_ids = node.get("metadata", {}).get("child_leaf_ids", [])
                    captured_mappings.append(child_ids)
            return [f"uuid_{i}" for i in range(len(nodes))]

        mock_vector_store.upsert_chunks = MagicMock(side_effect=capture_upsert)

        builder = RaptorBuilder(
            store=mock_vector_store,
            embedder=mock_embedder,
            gemini_client=mock_gemini,
        )

        leaf_uuids = [f"uuid_chunk_{i}" for i in range(len(synthetic_chunks))]

        await builder.build(
            leaf_chunks=synthetic_chunks,
            dense_embeddings=synthetic_embeddings.tolist(),
            leaf_uuids=leaf_uuids,
        )

        # Guard against a vacuous pass: with nothing captured, the loop below
        # would assert nothing at all.
        assert captured_mappings, "no raptor_summary nodes were upserted"

        # All child references should use leaf UUIDs
        known_leaf_uuids = set(leaf_uuids)
        for child_list in captured_mappings:
            for child_uuid in child_list:
                assert child_uuid in known_leaf_uuids, (
                    f"unknown child leaf id: {child_uuid!r}"
                )

    def test_raptor_builder_store_reference(self):
        """RaptorBuilder keeps the vector store when no Gemini client is passed."""
        mock_vector_store = MagicMock()
        mock_embedder = MagicMock()

        builder = RaptorBuilder(
            store=mock_vector_store,
            embedder=mock_embedder,
        )

        assert builder._store is mock_vector_store


class TestRaptorErrorHandling:
    """Robustness tests for RAPTOR failure modes."""

    @pytest.mark.asyncio
    async def test_raptor_graceful_gemini_failure(self):
        """If the Gemini client raises, ``build`` must not propagate the error."""
        mock_vector_store = MagicMock()
        mock_embedder = MagicMock()
        mock_gemini = MagicMock()

        def mock_summarise_fail(text: str):
            raise RuntimeError("Gemini API timeout")

        mock_gemini.summarise = AsyncMock(side_effect=mock_summarise_fail)

        def mock_embed(texts, is_query=False):
            rng = np.random.default_rng(seed=44)
            return rng.standard_normal((len(texts), 384)).astype(np.float32)

        mock_embedder.embed = AsyncMock(side_effect=mock_embed)
        mock_embedder.embed_texts_async = mock_embedder.embed

        mock_vector_store.upsert_chunks = MagicMock(return_value=[])

        builder = RaptorBuilder(
            store=mock_vector_store,
            embedder=mock_embedder,
            gemini_client=mock_gemini,
        )

        # NOTE(review): only one leaf chunk is supplied. If build() skips
        # clustering for such a small corpus, summarise() is never awaited and
        # the failure path is not exercised — confirm against RaptorBuilder.
        chunks = [
            {
                "id": "c1",
                "text": "Sample chunk about project architecture",
                "metadata": {"doc_id": "d1", "source_type": "blog"},
            }
        ]
        rng = np.random.default_rng(seed=42)
        embeddings = rng.standard_normal((1, 384)).astype(np.float32)

        # Should handle gracefully
        try:
            await builder.build(
                leaf_chunks=chunks,
                dense_embeddings=embeddings.tolist(),
                leaf_uuids=["uuid_c1"],
            )
        except Exception as exc:
            # Surface the underlying error so a failure is diagnosable.
            pytest.fail(
                f"RAPTOR should handle Gemini failure gracefully, got {exc!r}"
            )

    @pytest.mark.asyncio
    async def test_raptor_empty_corpus(self):
        """An empty corpus never results in a non-empty upsert."""
        mock_vector_store = MagicMock()
        mock_embedder = MagicMock()

        # A list matches what the other upsert_chunks mocks in this file return.
        mock_vector_store.upsert_chunks = MagicMock(return_value=[])

        builder = RaptorBuilder(
            store=mock_vector_store,
            embedder=mock_embedder,
        )

        await builder.build(
            leaf_chunks=[],
            dense_embeddings=[],
            leaf_uuids=[],
        )

        # Either upsert_chunks was never called, or every call carried an empty
        # node list. The nodes argument may arrive positionally or as the
        # ``nodes`` keyword, so both forms are handled.
        for call in mock_vector_store.upsert_chunks.call_args_list:
            nodes = call.args[0] if call.args else call.kwargs["nodes"]
            assert len(nodes) == 0