File size: 6,281 Bytes
4b445f6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b9da50c
4b445f6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
"""
Tests for the RAG (Retrieval-Augmented Generation) pipeline.

These tests verify:
1. Code chunking splits files correctly with overlap
2. ChromaDB indexing stores documents (with mocked embeddings)
3. Retrieval returns context for queries (with mocked embeddings)
4. Edge cases: empty files, very large files, non-existent collections

IMPORTANT: All tests mock embed_texts() to avoid loading the
sentence-transformers model, which takes ~60 seconds on first load.
"""

from unittest.mock import patch

import pytest

from app.context.embedder import chunk_code
from app.context.indexer import _collection_name, index_repo_files
from app.context.retriever import retrieve_context

# ─── Code Chunking Tests ─────────────────────────────────────────────────


class TestCodeChunking:
    """Tests for ``chunk_code``.

    Each returned chunk is read through the ``filepath``, ``start_line``,
    ``end_line`` and ``text`` keys.
    """

    def test_small_file_single_chunk(self):
        """A file smaller than chunk_size should produce one chunk."""
        code = "\n".join(f"line_{i} = {i}" for i in range(20))
        chunks = chunk_code(code, "small.py", chunk_size=60)
        assert len(chunks) == 1
        assert chunks[0]["filepath"] == "small.py"
        assert chunks[0]["start_line"] == 1
        assert "# File: small.py" in chunks[0]["text"]

    def test_large_file_multiple_chunks(self):
        """A file larger than chunk_size should produce multiple overlapping chunks."""
        code = "\n".join(f"line_{i} = {i}" for i in range(150))
        chunks = chunk_code(code, "large.py", chunk_size=60)
        assert len(chunks) >= 2

        # The assert above guarantees a second chunk, so no extra guard is needed.
        first_end = chunks[0]["end_line"]
        second_start = chunks[1]["start_line"]
        assert second_start < first_end  # Overlap exists

    def test_chunk_includes_filepath_in_text(self):
        """Each chunk should include the filepath as a header for context."""
        code = "\n".join(f"line_{i} = {i}" for i in range(10))
        chunks = chunk_code(code, "src/utils/helper.py")
        assert len(chunks) >= 1
        assert "# File: src/utils/helper.py" in chunks[0]["text"]

    def test_skips_nearly_empty_chunks(self):
        """Chunks with fewer than 5 non-empty lines should be skipped."""
        # Only three non-blank lines exist in total, so no chunk can reach five.
        code = "a = 1\n" + "\n" * 8 + "b = 2\n" + "\n" * 8 + "c = 3\n"
        chunks = chunk_code(code, "sparse.py", chunk_size=10)
        assert len(chunks) == 0

    def test_chunk_metadata_has_line_numbers(self):
        """Each chunk should have correct start_line and end_line."""
        code = "\n".join(f"x_{i} = {i}" for i in range(100))
        chunks = chunk_code(code, "numbered.py", chunk_size=30)
        # 100 lines cannot fit in one 30-line chunk; fail loudly instead of
        # silently skipping the second-chunk check below.
        assert len(chunks) >= 2
        assert chunks[0]["start_line"] == 1
        assert chunks[0]["end_line"] == 30
        # The second chunk is expected to start 10 lines before the first ends
        # (presumably chunk_code's default overlap; confirm against embedder).
        assert chunks[1]["start_line"] == 21


# ─── Collection Naming Tests ─────────────────────────────────────────────


class TestCollectionNaming:
    """Tests for the ``_collection_name`` helper that maps repo names to collection names."""

    def test_converts_repo_name_to_valid_collection(self):
        """A repo name containing '/' and '-' must yield a name free of both, prefixed 'repo_'."""
        collection = _collection_name("ninjacode911/code-guard-test")
        for forbidden in ("/", "-"):
            assert forbidden not in collection
        assert collection.startswith("repo_")

    def test_truncates_long_names(self):
        """The generated name must never exceed 63 characters (ChromaDB limit)."""
        oversized_repo = "organization/" + "a" * 100
        collection = _collection_name(oversized_repo)
        assert len(collection) <= 63


# ─── ChromaDB Indexer Tests ──────────────────────────────────────────────


class TestIndexer:
    """Async tests for ``index_repo_files``.

    Every test patches ``app.context.indexer.embed_texts`` so the real
    embedding model is never loaded.
    """

    @pytest.mark.asyncio
    async def test_index_repo_files_returns_collection_name(self):
        """Indexing should return a valid collection name."""
        files = {
            "app.py": "\n".join(f"x_{i} = {i}" for i in range(25)),
        }
        # A single fake 384-element vector stands in for the real embeddings.
        with patch("app.context.indexer.embed_texts", return_value=[[0.1] * 384]):
            name = await index_repo_files("ninjacode911/test-repo", files)
        assert name.startswith("repo_")

    @pytest.mark.asyncio
    async def test_index_handles_empty_files(self):
        """Empty file dict should not crash."""
        # Patched like the other tests so this path can never trigger a real
        # model load; there is nothing to embed, hence the empty result.
        with patch("app.context.indexer.embed_texts", return_value=[]):
            name = await index_repo_files("ninjacode911/empty-repo", {})
        assert name.startswith("repo_")

    @pytest.mark.asyncio
    async def test_index_skips_large_files(self):
        """Files over 100KB should be skipped to avoid memory issues."""
        files = {
            # 6 characters x 50,000 repetitions = 300,000 characters.
            "huge.py": "x = 1\n" * 50000,
            "small.py": "\n".join(f"y_{i} = {i}" for i in range(25)),
        }
        with patch("app.context.indexer.embed_texts", return_value=[[0.1] * 384]) as mock_embed:
            await index_repo_files("ninjacode911/skip-test", files)

        # Inspect every recorded call, not just the last one, so an earlier
        # batch containing the oversized file cannot slip through.
        # NOTE: this loop is empty (and the test passes) if embed_texts was
        # never called at all.
        for call in mock_embed.call_args_list:
            for text in call[0][0]:
                assert "huge.py" not in text


# ─── ChromaDB Retriever Tests ────────────────────────────────────────────


class TestRetriever:
    """Async tests for ``retrieve_context`` with ``embed_texts`` patched out."""

    @pytest.mark.asyncio
    async def test_retrieve_nonexistent_collection_returns_empty(self):
        """A lookup against a collection that does not exist yields an empty string."""
        fake_vectors = [[0.1] * 384]
        with patch("app.context.retriever.embed_texts", return_value=fake_vectors):
            context = await retrieve_context("nonexistent_xyz_collection", "query")
        assert context == ""

    @pytest.mark.asyncio
    async def test_retrieve_returns_string(self):
        """Indexing a repo and then querying its collection yields a ``str``."""
        source_lines = [f"code_line_{i} = {i}" for i in range(25)]
        repo_files = {"app.py": "\n".join(source_lines)}

        # Index first, with the indexer's embedding call replaced by a fake vector.
        with patch("app.context.indexer.embed_texts", return_value=[[0.1] * 384]):
            indexed_collection = await index_repo_files("ninjacode911/ret-test", repo_files)

        # Then query, with the retriever's embedding call replaced the same way.
        with patch("app.context.retriever.embed_texts", return_value=[[0.1] * 384]):
            context = await retrieve_context(indexed_collection, "SQL query")

        assert isinstance(context, str)