GitHub Actions committed on
Commit
b616cc1
·
1 Parent(s): 1f61582

Deploy 73a273d

Browse files
app/core/portfolio_context.py CHANGED
@@ -79,10 +79,65 @@ KNOWN_ORGS: frozenset[str] = frozenset({
79
  "github", "groq", "huggingface", "vercel", "cloudflare", "qdrant cloud",
80
  })
81
 
 
 
 
 
 
 
 
 
 
 
82
  # ---------------------------------------------------------------------------
83
  # All known portfolio nouns in one flat set for O(1) membership checks
84
  # ---------------------------------------------------------------------------
85
- ALL_PORTFOLIO_NOUNS: frozenset[str] = KNOWN_PROJECTS | KNOWN_TECHNOLOGIES | KNOWN_ORGS
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
86
 
87
  # Compact context block passed to Gemini when generating a specific not-found
88
  # suggestion. One sentence per major entity class — tight token budget.
@@ -111,7 +166,7 @@ def is_portfolio_relevant(query: str) -> bool:
111
  tokens = re.findall(r"[a-z0-9]+", query.lower())
112
  # Single-token check
113
  for token in tokens:
114
- if token in ALL_PORTFOLIO_NOUNS:
115
  return True
116
  # Bigram check — catches "vk live", "text ops", "echo echo"
117
  for a, b in zip(tokens, tokens[1:]):
 
79
  "github", "groq", "huggingface", "vercel", "cloudflare", "qdrant cloud",
80
  })
81
 
82
# ---------------------------------------------------------------------------
# Intent nouns that should always route to portfolio retrieval paths
# (especially resume/CV questions that may not mention named entities).
# NOTE: multi-word entries ("work experience") only match via bigram checks;
# single-token fuzzy matching skips them (see _SINGLE_TOKEN_NOUNS).
# ---------------------------------------------------------------------------
KNOWN_INTENTS: frozenset[str] = frozenset({
    "work", "experience", "work experience", "career", "employment", "job", "role",
    "internship", "internships", "skills", "skill", "education", "degree", "university",
    "resume", "cv", "background", "certification", "certifications",
})
91
+
92
  # ---------------------------------------------------------------------------
93
  # All known portfolio nouns in one flat set for O(1) membership checks
94
  # ---------------------------------------------------------------------------
95
+ ALL_PORTFOLIO_NOUNS: frozenset[str] = KNOWN_PROJECTS | KNOWN_TECHNOLOGIES | KNOWN_ORGS | KNOWN_INTENTS
96
+
97
+ # Single-token subset for typo-tolerant matching (e.g. "walk" -> "work").
98
+ _SINGLE_TOKEN_NOUNS: frozenset[str] = frozenset({n for n in ALL_PORTFOLIO_NOUNS if " " not in n})
99
+
100
+
101
+ def _is_edit_distance_leq_one(a: str, b: str) -> bool:
102
+ """Fast check for Levenshtein distance <= 1 (substitute/insert/delete)."""
103
+ if a == b:
104
+ return True
105
+ la, lb = len(a), len(b)
106
+ if abs(la - lb) > 1:
107
+ return False
108
+
109
+ if la == lb:
110
+ mismatches = sum(1 for x, y in zip(a, b) if x != y)
111
+ return mismatches <= 1
112
+
113
+ # Ensure a is shorter for insert/delete logic.
114
+ if la > lb:
115
+ a, b = b, a
116
+ la, lb = lb, la
117
+
118
+ i = j = 0
119
+ mismatch = 0
120
+ while i < la and j < lb:
121
+ if a[i] == b[j]:
122
+ i += 1
123
+ j += 1
124
+ continue
125
+ mismatch += 1
126
+ if mismatch > 1:
127
+ return False
128
+ j += 1
129
+ return True
130
+
131
+
132
def _token_matches_known_portfolio_noun(token: str) -> bool:
    """True when *token* is a portfolio noun, exactly or within one edit."""
    if token in ALL_PORTFOLIO_NOUNS:
        return True
    if len(token) < 4:
        # Very short tokens generate too many fuzzy false positives.
        return False
    return any(
        abs(len(token) - len(known)) <= 1 and _is_edit_distance_leq_one(token, known)
        for known in _SINGLE_TOKEN_NOUNS
    )
141
 
142
  # Compact context block passed to Gemini when generating a specific not-found
143
  # suggestion. One sentence per major entity class — tight token budget.
 
166
  tokens = re.findall(r"[a-z0-9]+", query.lower())
167
  # Single-token check
168
  for token in tokens:
169
+ if _token_matches_known_portfolio_noun(token):
170
  return True
171
  # Bigram check — catches "vk live", "text ops", "echo echo"
172
  for a, b in zip(tokens, tokens[1:]):
app/pipeline/nodes/retrieve.py CHANGED
@@ -111,6 +111,71 @@ _TYPE_REMAP: dict[str, str] = {
111
  "resume": "resume", # RC-3: explicit pass-through so resume chunks aren't "unknown"
112
  }
113
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
114
 
115
  def make_retrieve_node(
116
  vector_store: VectorStore, embedder: Embedder, reranker: Reranker
@@ -125,6 +190,7 @@ def make_retrieve_node(
125
  # cosine similarity against "PersonaBot RAG pipeline" passages; the rewrite
126
  # "What ML projects has Darshan built?" dramatically improves recall.
127
  retrieval_query = state.get("decontextualized_query") or query
 
128
 
129
  # Reuse the topic computed by the guard node — no recomputation needed.
130
  topic = state.get("query_topic") or ""
 
111
  "resume": "resume", # RC-3: explicit pass-through so resume chunks aren't "unknown"
112
  }
113
 
114
# Flat single-word vocabulary used for typo correction of focus/intent words.
# NOTE(review): assumes each key of _FOCUS_KEYWORDS is a tuple (or other
# iterable) of keyword strings; if a key were a plain string, the inner loop
# would collect its individual characters — TODO confirm against the
# _FOCUS_KEYWORDS definition above.
_FOCUS_VOCAB: frozenset[str] = frozenset(
    {
        keyword
        for keys in _FOCUS_KEYWORDS.keys()
        for keyword in keys
        if " " not in keyword  # fuzzy matching is per-token; drop phrases
    }
)
122
+
123
+
124
+ def _edit_distance(a: str, b: str) -> int:
125
+ la, lb = len(a), len(b)
126
+ dp = list(range(lb + 1))
127
+ for i in range(1, la + 1):
128
+ prev = dp[0]
129
+ dp[0] = i
130
+ for j in range(1, lb + 1):
131
+ cur = dp[j]
132
+ cost = 0 if a[i - 1] == b[j - 1] else 1
133
+ dp[j] = min(dp[j] + 1, dp[j - 1] + 1, prev + cost)
134
+ prev = cur
135
+ return dp[lb]
136
+
137
+
138
def _best_focus_replacement(token: str) -> str | None:
    """
    Return the closest focus-vocabulary word within two edits of *token*,
    or None when no candidate qualifies.

    Candidates must share the first character with *token* and differ in
    length by at most one (cheap pre-filters before the full edit-distance
    computation). Ties on edit distance are broken alphabetically so the
    result is deterministic across runs — iterating the frozenset directly
    would make tie-breaking depend on hash-randomised set order.
    """
    if not token:
        # Guard: token[0] below would raise on an empty string.
        return None

    best: str | None = None
    best_score = 3  # only distances <= 2 are acceptable replacements
    for candidate in sorted(_FOCUS_VOCAB):
        if token[0] != candidate[0]:
            continue
        if abs(len(token) - len(candidate)) > 1:
            continue
        score = _edit_distance(token, candidate)
        if score < best_score:
            best_score = score
            best = candidate
            if best_score == 0:
                break  # exact match cannot be improved upon
    return best
151
+
152
+
153
def _normalise_focus_typos(query: str) -> str:
    """
    Correct minor STT typos for intent words used by focused retrieval.

    Example: "walk experience" -> "work experience".

    NOTE: the returned string is always lower-cased (and whitespace is
    collapsed to single spaces), even when no correction is applied; an
    empty/whitespace-only query is returned unchanged.
    """
    words = query.lower().split()
    if not words:
        return query

    punctuation = ".,!?;:\"'()[]{}"
    normalised: list[str] = []
    for word in words:
        core = word.strip(punctuation)
        # Short words and exact vocabulary hits pass through untouched.
        if len(core) < 4 or core in _FOCUS_VOCAB:
            normalised.append(word)
            continue

        fixed = _best_focus_replacement(core)
        normalised.append(word.replace(core, fixed) if fixed else word)

    return " ".join(normalised)
178
+
179
 
180
  def make_retrieve_node(
181
  vector_store: VectorStore, embedder: Embedder, reranker: Reranker
 
190
  # cosine similarity against "PersonaBot RAG pipeline" passages; the rewrite
191
  # "What ML projects has Darshan built?" dramatically improves recall.
192
  retrieval_query = state.get("decontextualized_query") or query
193
+ retrieval_query = _normalise_focus_typos(retrieval_query)
194
 
195
  # Reuse the topic computed by the guard node — no recomputation needed.
196
  topic = state.get("query_topic") or ""
tests/test_enumerate_query.py CHANGED
@@ -211,3 +211,9 @@ class TestIsPortfolioRelevant:
211
 
212
  def test_empty_string(self):
213
  assert is_portfolio_relevant("") is False
 
 
 
 
 
 
 
211
 
212
  def test_empty_string(self):
213
  assert is_portfolio_relevant("") is False
214
+
215
+ def test_resume_intent_keywords_are_relevant(self):
216
+ assert is_portfolio_relevant("tell me about his work experience") is True
217
+
218
+ def test_stt_typo_work_experience_is_still_relevant(self):
219
+ assert is_portfolio_relevant("tell me about his walk experience") is True
tests/test_retrieve_query_normalization.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from app.pipeline.nodes.retrieve import _normalise_focus_typos
2
+
3
+
4
def test_walk_experience_normalises_to_work_experience() -> None:
    # "walk" is a common STT mis-hearing of "work"; it must be corrected,
    # and the helper lower-cases its output as a side effect.
    fixed = _normalise_focus_typos("Can you tell me about his walk experience then?")
    assert fixed == "can you tell me about his work experience then?"
8
+
9
+
10
def test_non_focus_text_is_not_overwritten() -> None:
    # Words far from the focus vocabulary must survive untouched; only the
    # lower-casing side effect is expected.
    original = "Tell me about widget orchestration internals"
    assert _normalise_focus_typos(original) == original.lower()