Spaces:
Running
Running
GitHub Actions committed on
Commit ·
b616cc1
1
Parent(s): 1f61582
Deploy 73a273d
Browse files
app/core/portfolio_context.py
CHANGED
|
@@ -79,10 +79,65 @@ KNOWN_ORGS: frozenset[str] = frozenset({
|
|
| 79 |
"github", "groq", "huggingface", "vercel", "cloudflare", "qdrant cloud",
|
| 80 |
})
|
| 81 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 82 |
# ---------------------------------------------------------------------------
|
| 83 |
# All known portfolio nouns in one flat set for O(1) membership checks
|
| 84 |
# ---------------------------------------------------------------------------
|
| 85 |
-
ALL_PORTFOLIO_NOUNS: frozenset[str] = KNOWN_PROJECTS | KNOWN_TECHNOLOGIES | KNOWN_ORGS
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 86 |
|
| 87 |
# Compact context block passed to Gemini when generating a specific not-found
|
| 88 |
# suggestion. One sentence per major entity class — tight token budget.
|
|
@@ -111,7 +166,7 @@ def is_portfolio_relevant(query: str) -> bool:
|
|
| 111 |
tokens = re.findall(r"[a-z0-9]+", query.lower())
|
| 112 |
# Single-token check
|
| 113 |
for token in tokens:
|
| 114 |
-
if token
|
| 115 |
return True
|
| 116 |
# Bigram check — catches "vk live", "text ops", "echo echo"
|
| 117 |
for a, b in zip(tokens, tokens[1:]):
|
|
|
|
| 79 |
"github", "groq", "huggingface", "vercel", "cloudflare", "qdrant cloud",
|
| 80 |
})
|
| 81 |
|
| 82 |
+
# ---------------------------------------------------------------------------
# Intent nouns that should always route to portfolio retrieval paths
# (especially resume/CV questions that may not mention named entities).
# ---------------------------------------------------------------------------
KNOWN_INTENTS: frozenset[str] = frozenset({
    "work", "experience", "work experience", "career", "employment", "job",
    "role", "internship", "internships", "skills", "skill", "education",
    "degree", "university", "resume", "cv", "background", "certification",
    "certifications",
})
|
| 91 |
+
|
| 92 |
# ---------------------------------------------------------------------------
|
| 93 |
# All known portfolio nouns in one flat set for O(1) membership checks
|
| 94 |
# ---------------------------------------------------------------------------
|
| 95 |
+
ALL_PORTFOLIO_NOUNS: frozenset[str] = KNOWN_PROJECTS | KNOWN_TECHNOLOGIES | KNOWN_ORGS | KNOWN_INTENTS

# Single-token subset used for typo-tolerant (edit distance <= 1) matching,
# e.g. "skils" -> "skills". Multi-word entries are excluded because the fuzzy
# check runs on individual query tokens. NOTE: "walk" -> "work" is distance 2
# and does NOT match via this path; "walk experience" is still classified as
# relevant only because "experience" is itself a known intent noun.
_SINGLE_TOKEN_NOUNS: frozenset[str] = frozenset({n for n in ALL_PORTFOLIO_NOUNS if " " not in n})
|
| 99 |
+
|
| 100 |
+
|
| 101 |
+
def _is_edit_distance_leq_one(a: str, b: str) -> bool:
|
| 102 |
+
"""Fast check for Levenshtein distance <= 1 (substitute/insert/delete)."""
|
| 103 |
+
if a == b:
|
| 104 |
+
return True
|
| 105 |
+
la, lb = len(a), len(b)
|
| 106 |
+
if abs(la - lb) > 1:
|
| 107 |
+
return False
|
| 108 |
+
|
| 109 |
+
if la == lb:
|
| 110 |
+
mismatches = sum(1 for x, y in zip(a, b) if x != y)
|
| 111 |
+
return mismatches <= 1
|
| 112 |
+
|
| 113 |
+
# Ensure a is shorter for insert/delete logic.
|
| 114 |
+
if la > lb:
|
| 115 |
+
a, b = b, a
|
| 116 |
+
la, lb = lb, la
|
| 117 |
+
|
| 118 |
+
i = j = 0
|
| 119 |
+
mismatch = 0
|
| 120 |
+
while i < la and j < lb:
|
| 121 |
+
if a[i] == b[j]:
|
| 122 |
+
i += 1
|
| 123 |
+
j += 1
|
| 124 |
+
continue
|
| 125 |
+
mismatch += 1
|
| 126 |
+
if mismatch > 1:
|
| 127 |
+
return False
|
| 128 |
+
j += 1
|
| 129 |
+
return True
|
| 130 |
+
|
| 131 |
+
|
| 132 |
+
def _token_matches_known_portfolio_noun(token: str) -> bool:
    """Return True when *token* exactly or nearly names a portfolio noun."""
    if token in ALL_PORTFOLIO_NOUNS:
        return True
    # Fuzzy matching on very short tokens is too noisy — skip it.
    if len(token) < 4:
        return False
    return any(
        abs(len(token) - len(known)) <= 1 and _is_edit_distance_leq_one(token, known)
        for known in _SINGLE_TOKEN_NOUNS
    )
|
| 141 |
|
| 142 |
# Compact context block passed to Gemini when generating a specific not-found
|
| 143 |
# suggestion. One sentence per major entity class — tight token budget.
|
|
|
|
| 166 |
tokens = re.findall(r"[a-z0-9]+", query.lower())
|
| 167 |
# Single-token check
|
| 168 |
for token in tokens:
|
| 169 |
+
if _token_matches_known_portfolio_noun(token):
|
| 170 |
return True
|
| 171 |
# Bigram check — catches "vk live", "text ops", "echo echo"
|
| 172 |
for a, b in zip(tokens, tokens[1:]):
|
app/pipeline/nodes/retrieve.py
CHANGED
|
@@ -111,6 +111,71 @@ _TYPE_REMAP: dict[str, str] = {
|
|
| 111 |
"resume": "resume", # RC-3: explicit pass-through so resume chunks aren't "unknown"
|
| 112 |
}
|
| 113 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 114 |
|
| 115 |
def make_retrieve_node(
|
| 116 |
vector_store: VectorStore, embedder: Embedder, reranker: Reranker
|
|
@@ -125,6 +190,7 @@ def make_retrieve_node(
|
|
| 125 |
# cosine similarity against "PersonaBot RAG pipeline" passages; the rewrite
|
| 126 |
# "What ML projects has Darshan built?" dramatically improves recall.
|
| 127 |
retrieval_query = state.get("decontextualized_query") or query
|
|
|
|
| 128 |
|
| 129 |
# Reuse the topic computed by the guard node — no recomputation needed.
|
| 130 |
topic = state.get("query_topic") or ""
|
|
|
|
| 111 |
"resume": "resume", # RC-3: explicit pass-through so resume chunks aren't "unknown"
|
| 112 |
}
|
| 113 |
|
| 114 |
+
# Flat vocabulary of single-word focus keywords for typo correction.
# NOTE(review): assumes _FOCUS_KEYWORDS is keyed by tuples/iterables of
# keyword strings — if a key were a plain string, the inner loop would
# iterate its characters. Confirm against the _FOCUS_KEYWORDS definition.
_FOCUS_VOCAB: frozenset[str] = frozenset(
    keyword
    for key_group in _FOCUS_KEYWORDS
    for keyword in key_group
    if " " not in keyword
)
|
| 122 |
+
|
| 123 |
+
|
| 124 |
+
def _edit_distance(a: str, b: str) -> int:
|
| 125 |
+
la, lb = len(a), len(b)
|
| 126 |
+
dp = list(range(lb + 1))
|
| 127 |
+
for i in range(1, la + 1):
|
| 128 |
+
prev = dp[0]
|
| 129 |
+
dp[0] = i
|
| 130 |
+
for j in range(1, lb + 1):
|
| 131 |
+
cur = dp[j]
|
| 132 |
+
cost = 0 if a[i - 1] == b[j - 1] else 1
|
| 133 |
+
dp[j] = min(dp[j] + 1, dp[j - 1] + 1, prev + cost)
|
| 134 |
+
prev = cur
|
| 135 |
+
return dp[lb]
|
| 136 |
+
|
| 137 |
+
|
| 138 |
+
def _best_focus_replacement(token: str) -> str | None:
    """Return the closest focus-vocab word within edit distance 2, or None.

    Candidates must share the first letter and differ in length by at most
    one — cheap filters that skip the DP for most of the vocabulary.

    Iterates the vocabulary in sorted order so ties between equal-distance
    candidates resolve deterministically; iterating the frozenset directly
    depends on hash order, which varies with PYTHONHASHSEED across runs.
    """
    if not token:  # guard token[0] below
        return None
    best: str | None = None
    best_score = 3  # accept only distance <= 2; strict '<' keeps first winner
    first = token[0]
    for candidate in sorted(_FOCUS_VOCAB):
        if candidate[0] != first:
            continue
        if abs(len(token) - len(candidate)) > 1:
            continue
        score = _edit_distance(token, candidate)
        if score < best_score:
            best_score = score
            best = candidate
    return best
|
| 151 |
+
|
| 152 |
+
|
| 153 |
+
def _normalise_focus_typos(query: str) -> str:
    """
    Correct minor STT typos for intent words used by focused retrieval.

    Example: "walk experience" -> "work experience".

    Side effect: the query is lowercased and runs of whitespace collapse to
    single spaces, because correction works on lowercased split tokens.
    """
    words = query.lower().split()
    if not words:
        return query

    rebuilt: list[str] = []
    for word in words:
        # Strip surrounding punctuation so "experience?" still matches.
        core = word.strip(".,!?;:\"'()[]{}")
        if len(core) < 4 or core in _FOCUS_VOCAB:
            rebuilt.append(word)
            continue
        fixed = _best_focus_replacement(core)
        rebuilt.append(word.replace(core, fixed) if fixed else word)
    return " ".join(rebuilt)
|
| 178 |
+
|
| 179 |
|
| 180 |
def make_retrieve_node(
|
| 181 |
vector_store: VectorStore, embedder: Embedder, reranker: Reranker
|
|
|
|
| 190 |
# cosine similarity against "PersonaBot RAG pipeline" passages; the rewrite
|
| 191 |
# "What ML projects has Darshan built?" dramatically improves recall.
|
| 192 |
retrieval_query = state.get("decontextualized_query") or query
|
| 193 |
+
retrieval_query = _normalise_focus_typos(retrieval_query)
|
| 194 |
|
| 195 |
# Reuse the topic computed by the guard node — no recomputation needed.
|
| 196 |
topic = state.get("query_topic") or ""
|
tests/test_enumerate_query.py
CHANGED
|
@@ -211,3 +211,9 @@ class TestIsPortfolioRelevant:
|
|
| 211 |
|
| 212 |
def test_empty_string(self):
|
| 213 |
assert is_portfolio_relevant("") is False
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 211 |
|
| 212 |
def test_empty_string(self):
|
| 213 |
assert is_portfolio_relevant("") is False
|
| 214 |
+
|
| 215 |
+
def test_resume_intent_keywords_are_relevant(self):
|
| 216 |
+
assert is_portfolio_relevant("tell me about his work experience") is True
|
| 217 |
+
|
| 218 |
+
def test_stt_typo_work_experience_is_still_relevant(self):
|
| 219 |
+
assert is_portfolio_relevant("tell me about his walk experience") is True
|
tests/test_retrieve_query_normalization.py
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from app.pipeline.nodes.retrieve import _normalise_focus_typos


def test_walk_experience_normalises_to_work_experience() -> None:
    fixed = _normalise_focus_typos("Can you tell me about his walk experience then?")
    assert fixed == "can you tell me about his work experience then?"


def test_non_focus_text_is_not_overwritten() -> None:
    original = "Tell me about widget orchestration internals"
    # Tokenisation lowercases the query even when nothing is corrected.
    assert _normalise_focus_typos(original) == original.lower()
|