File size: 8,687 Bytes
0da0699
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b616cc1
 
 
 
 
 
c44df3b
 
 
8da917e
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
# backend/tests/test_enumerate_query.py
# Unit tests for the enumeration query classifier (Fix 1) and
# the portfolio-relevance helper (Fix 2 Rule 1).
#
# All tests are pure-Python; no network calls, no Qdrant, no embedder.

import pytest
from unittest.mock import AsyncMock, MagicMock, patch

from app.pipeline.nodes.enumerate_query import (
    _has_enumeration_intent,
    _extract_source_types,
    make_enumerate_query_node,
)
from app.core.portfolio_context import is_portfolio_relevant

# Patch target for LangGraph's stream writer, which requires a runnable context
# that doesn't exist in unit tests.
_WRITER_PATCH = "app.pipeline.nodes.enumerate_query.get_stream_writer"


# ---------------------------------------------------------------------------
# _has_enumeration_intent
# ---------------------------------------------------------------------------


class TestHasEnumerationIntent:
    """Checks which query phrasings `_has_enumeration_intent` flags as list/count requests."""

    def test_list_all_projects(self):
        """'list all <type>' is flagged."""
        verdict = _has_enumeration_intent("list all projects")
        assert verdict is True

    def test_list_projects_no_all(self):
        """'list <type>' is flagged even without the word 'all'."""
        verdict = _has_enumeration_intent("list projects")
        assert verdict is True

    def test_show_all_blogs(self):
        """'show all <type>' is flagged."""
        verdict = _has_enumeration_intent("show all blog posts")
        assert verdict is True

    def test_how_many_blogs(self):
        """'how many <type>' counting questions are flagged."""
        verdict = _has_enumeration_intent("how many blog posts do you have")
        assert verdict is True

    def test_count_projects(self):
        """'count <type>' is flagged."""
        verdict = _has_enumeration_intent("count projects")
        assert verdict is True

    def test_enumerate_skills(self):
        """'enumerate all <type>' is flagged."""
        verdict = _has_enumeration_intent("enumerate all skills")
        assert verdict is True

    def test_give_me_a_list_of(self):
        """'give me a list of ...' is flagged."""
        verdict = _has_enumeration_intent("give me a list of your projects")
        assert verdict is True

    def test_what_are_all_the_projects(self):
        """'what are all the X' is flagged (described upstream as the trailing-regex pattern)."""
        verdict = _has_enumeration_intent("what are all the projects")
        assert verdict is True

    def test_which_are_all_the_blogs(self):
        """'which are all the X' is flagged.

        The "all" keyword is required here; per the original note, the trailing
        regex gate is what prevents over-triggering.
        """
        verdict = _has_enumeration_intent("which are all the blog posts")
        assert verdict is True

    def test_regular_how_query_no_intent(self):
        """A 'how does X work' question is not flagged."""
        verdict = _has_enumeration_intent("how does TextOps work")
        assert verdict is False

    def test_explain_query_no_intent(self):
        """An 'explain ...' request is not flagged."""
        verdict = _has_enumeration_intent("explain the architecture of PersonaBot")
        assert verdict is False

    def test_what_is_query_no_intent(self):
        """A 'what is X' question is not flagged."""
        verdict = _has_enumeration_intent("what is echo-echo")
        assert verdict is False

    def test_tell_me_about_no_intent(self):
        """A 'tell me about ...' request is not flagged."""
        verdict = _has_enumeration_intent("tell me about your background")
        assert verdict is False

    def test_empty_string(self):
        """The empty query is not flagged."""
        verdict = _has_enumeration_intent("")
        assert verdict is False


# ---------------------------------------------------------------------------
# _extract_source_types
# ---------------------------------------------------------------------------


class TestExtractSourceTypes:
    """Checks the source-type tokens `_extract_source_types` derives from a query."""

    def test_projects(self):
        """A query mentioning projects yields the "project" type."""
        assert "project" in _extract_source_types("list all projects")

    def test_blogs(self):
        """A query mentioning blog posts yields the "blog" type."""
        assert "blog" in _extract_source_types("show all blog posts")

    def test_skills_cv(self):
        """A query mentioning skills yields the "cv" type."""
        assert "cv" in _extract_source_types("list all your skills")

    def test_generic_returns_empty(self):
        """A query without any type token yields an empty list.

        Per the original note, [] for "everything"/"all" means scroll all types.
        """
        assert _extract_source_types("list everything") == []

    def test_github_repos(self):
        """A query mentioning github repos yields the "github" type."""
        assert "github" in _extract_source_types("show all github repos")

    def test_work_experience(self):
        """A query mentioning work experience yields the "cv" type."""
        assert "cv" in _extract_source_types("list all work experience")


# ---------------------------------------------------------------------------
# make_enumerate_query_node
# ---------------------------------------------------------------------------


@pytest.mark.asyncio
async def test_non_enumeration_query_passes_through():
    """Ordinary queries leave the node with is_enumeration_query=False and no scroll call."""
    # NOTE(review): the node is invoked without `await` even though this test is
    # async — confirm the node returned by the factory is a plain callable.
    vector_store = MagicMock()
    vector_store.scroll_by_source_type.return_value = []
    enumerate_node = make_enumerate_query_node(vector_store)

    graph_state = {"query": "how does TextOps work", "retrieval_attempts": 0}

    # The stream writer lookup is patched only for the duration of the node call.
    with patch(_WRITER_PATCH, return_value=MagicMock()):
        outcome = enumerate_node(graph_state)

    assert outcome["is_enumeration_query"] is False
    # Zero-cost guarantee: a normal query must never reach the vector store.
    vector_store.scroll_by_source_type.assert_not_called()


@pytest.mark.asyncio
async def test_enumeration_query_sets_flag_and_populates_chunks():
    """A listing query triggers exactly one scroll and returns both distinct chunks."""
    # Two chunks with different source titles, both of type "project".
    project_chunks = [
        {
            "text": body,
            "metadata": {"source_title": title, "source_type": "project", "doc_id": doc_id},
        }
        for body, title, doc_id in (
            ("TextOps is a CLI toolkit.", "TextOps", "textops-1"),
            ("Echo-Echo is a WebRTC demo.", "Echo-Echo", "echo-1"),
        )
    ]
    vector_store = MagicMock()
    vector_store.scroll_by_source_type.return_value = project_chunks
    enumerate_node = make_enumerate_query_node(vector_store)

    graph_state = {"query": "list all projects", "retrieval_attempts": 0}

    # The stream writer lookup is patched only for the duration of the node call.
    with patch(_WRITER_PATCH, return_value=MagicMock()):
        outcome = enumerate_node(graph_state)

    assert outcome["is_enumeration_query"] is True
    assert len(outcome["reranked_chunks"]) == 2
    vector_store.scroll_by_source_type.assert_called_once()


@pytest.mark.asyncio
async def test_enumeration_deduplicates_by_source_title():
    """Chunks sharing a source_title collapse into a single entry in reranked_chunks."""
    # Same source_title, different text and doc_id.
    same_title_chunks = [
        {
            "text": body,
            "metadata": {"source_title": "TextOps", "source_type": "project", "doc_id": doc_id},
        }
        for body, doc_id in (
            ("TextOps chunk 1", "textops-1"),
            ("TextOps chunk 2", "textops-2"),
        )
    ]
    vector_store = MagicMock()
    vector_store.scroll_by_source_type.return_value = same_title_chunks
    enumerate_node = make_enumerate_query_node(vector_store)

    graph_state = {"query": "list all projects", "retrieval_attempts": 0}

    # The stream writer lookup is patched only for the duration of the node call.
    with patch(_WRITER_PATCH, return_value=MagicMock()):
        outcome = enumerate_node(graph_state)

    assert outcome["is_enumeration_query"] is True
    assert len(outcome["reranked_chunks"]) == 1


@pytest.mark.asyncio
async def test_enumeration_empty_scroll_returns_not_found():
    """A listing query whose scroll yields nothing leaves is_enumeration_query False."""
    vector_store = MagicMock()
    vector_store.scroll_by_source_type.return_value = []
    enumerate_node = make_enumerate_query_node(vector_store)

    graph_state = {"query": "list all projects", "retrieval_attempts": 0}

    # The stream writer lookup is patched only for the duration of the node call.
    with patch(_WRITER_PATCH, return_value=MagicMock()):
        outcome = enumerate_node(graph_state)

    # Nothing to list: the node is expected not to commit to the enumeration
    # path (per the original note, the query then falls through to RAG).
    assert outcome["is_enumeration_query"] is False


# ---------------------------------------------------------------------------
# is_portfolio_relevant (Fix 2 Rule 1)
# ---------------------------------------------------------------------------


class TestIsPortfolioRelevant:
    """Checks which queries `is_portfolio_relevant` treats as being about the portfolio."""

    def test_known_project_name(self):
        """A query naming a project (textops) is relevant."""
        relevant = is_portfolio_relevant("how does textops work")
        assert relevant is True

    def test_known_project_variant(self):
        """A spaced spelling of a project name (echo echo) is relevant."""
        relevant = is_portfolio_relevant("tell me about echo echo")
        assert relevant is True

    def test_known_technology(self):
        """A query naming a technology (langchain) is relevant."""
        relevant = is_portfolio_relevant("explain the use of langchain in your stack")
        assert relevant is True

    def test_known_organisation(self):
        """A query naming an organisation (vk live) is relevant."""
        relevant = is_portfolio_relevant("what did you do at vk live")
        assert relevant is True

    def test_unrelated_query(self):
        """A weather question is not relevant."""
        relevant = is_portfolio_relevant("what is the weather in london")
        assert relevant is False

    def test_generic_question(self):
        """A request for a joke is not relevant."""
        relevant = is_portfolio_relevant("tell me a joke")
        assert relevant is False

    def test_empty_string(self):
        """The empty query is not relevant."""
        relevant = is_portfolio_relevant("")
        assert relevant is False

    def test_resume_intent_keywords_are_relevant(self):
        """Asking about work experience is relevant."""
        relevant = is_portfolio_relevant("tell me about his work experience")
        assert relevant is True

    def test_stt_typo_work_experience_is_still_relevant(self):
        """The mis-transcribed phrase "walk experience" is still relevant."""
        relevant = is_portfolio_relevant("tell me about his walk experience")
        assert relevant is True

    def test_tech_stack_intent_is_relevant(self):
        """A capitalised, punctuated tech-stack question is relevant."""
        relevant = is_portfolio_relevant("Could you tell me about his tech stack?")
        assert relevant is True

    def test_professional_setting_work_experience_is_relevant(self):
        """A work-experience question framed around a professional setting is relevant."""
        relevant = is_portfolio_relevant("What work experience do you have in a professional setting")
        assert relevant is True

    def test_tech_stack_use_phrase_is_relevant(self):
        """The 'what tech stack does he use' phrasing is relevant."""
        relevant = is_portfolio_relevant("What tech stack does he use")
        assert relevant is True