Spaces:
Running
Running
GitHub Actions committed on
Commit ·
b616cc1
1
Parent(s): 1f61582
Deploy 73a273d
Browse files
app/core/portfolio_context.py
CHANGED
|
@@ -79,10 +79,65 @@ KNOWN_ORGS: frozenset[str] = frozenset({
|
|
| 79 |
"github", "groq", "huggingface", "vercel", "cloudflare", "qdrant cloud",
|
| 80 |
})
|
| 81 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 82 |
# ---------------------------------------------------------------------------
|
| 83 |
# All known portfolio nouns in one flat set for O(1) membership checks
|
| 84 |
# ---------------------------------------------------------------------------
|
| 85 |
-
ALL_PORTFOLIO_NOUNS: frozenset[str] = KNOWN_PROJECTS | KNOWN_TECHNOLOGIES | KNOWN_ORGS
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 86 |
|
| 87 |
# Compact context block passed to Gemini when generating a specific not-found
|
| 88 |
# suggestion. One sentence per major entity class — tight token budget.
|
|
@@ -111,7 +166,7 @@ def is_portfolio_relevant(query: str) -> bool:
|
|
| 111 |
tokens = re.findall(r"[a-z0-9]+", query.lower())
|
| 112 |
# Single-token check
|
| 113 |
for token in tokens:
|
| 114 |
-
if token
|
| 115 |
return True
|
| 116 |
# Bigram check — catches "vk live", "text ops", "echo echo"
|
| 117 |
for a, b in zip(tokens, tokens[1:]):
|
|
|
|
| 79 |
"github", "groq", "huggingface", "vercel", "cloudflare", "qdrant cloud",
|
| 80 |
})
|
| 81 |
|
| 82 |
+
# ---------------------------------------------------------------------------
# Intent nouns that should always route to portfolio retrieval paths
# (especially resume/CV questions that may not mention named entities).
# ---------------------------------------------------------------------------
KNOWN_INTENTS: frozenset[str] = frozenset({
    "work", "experience", "work experience", "career", "employment", "job",
    "role", "internship", "internships", "skills", "skill", "education",
    "degree", "university", "resume", "cv", "background", "certification",
    "certifications",
})
|
| 91 |
+
|
| 92 |
# ---------------------------------------------------------------------------
|
| 93 |
# All known portfolio nouns in one flat set for O(1) membership checks
|
| 94 |
# ---------------------------------------------------------------------------
|
| 95 |
+
ALL_PORTFOLIO_NOUNS: frozenset[str] = KNOWN_PROJECTS | KNOWN_TECHNOLOGIES | KNOWN_ORGS | KNOWN_INTENTS

# Single-token subset used for typo-tolerant (edit distance <= 1) matching,
# e.g. "skils" -> "skills". Multi-word entries are excluded because the fuzzy
# check runs on individual query tokens. NOTE: "walk" -> "work" is distance 2
# and does NOT match via this path; "walk experience" is still classified as
# relevant only because "experience" is itself a known intent noun.
_SINGLE_TOKEN_NOUNS: frozenset[str] = frozenset({n for n in ALL_PORTFOLIO_NOUNS if " " not in n})
|
| 99 |
+
|
| 100 |
+
|
| 101 |
+
def _is_edit_distance_leq_one(a: str, b: str) -> bool:
|
| 102 |
+
"""Fast check for Levenshtein distance <= 1 (substitute/insert/delete)."""
|
| 103 |
+
if a == b:
|
| 104 |
+
return True
|
| 105 |
+
la, lb = len(a), len(b)
|
| 106 |
+
if abs(la - lb) > 1:
|
| 107 |
+
return False
|
| 108 |
+
|
| 109 |
+
if la == lb:
|
| 110 |
+
mismatches = sum(1 for x, y in zip(a, b) if x != y)
|
| 111 |
+
return mismatches <= 1
|
| 112 |
+
|
| 113 |
+
# Ensure a is shorter for insert/delete logic.
|
| 114 |
+
if la > lb:
|
| 115 |
+
a, b = b, a
|
| 116 |
+
la, lb = lb, la
|
| 117 |
+
|
| 118 |
+
i = j = 0
|
| 119 |
+
mismatch = 0
|
| 120 |
+
while i < la and j < lb:
|
| 121 |
+
if a[i] == b[j]:
|
| 122 |
+
i += 1
|
| 123 |
+
j += 1
|
| 124 |
+
continue
|
| 125 |
+
mismatch += 1
|
| 126 |
+
if mismatch > 1:
|
| 127 |
+
return False
|
| 128 |
+
j += 1
|
| 129 |
+
return True
|
| 130 |
+
|
| 131 |
+
|
| 132 |
+
def _token_matches_known_portfolio_noun(token: str) -> bool:
    """Return True when *token* exactly or nearly names a portfolio noun."""
    if token in ALL_PORTFOLIO_NOUNS:
        return True
    # Fuzzy matching on very short tokens is too noisy — skip it.
    if len(token) < 4:
        return False
    return any(
        abs(len(token) - len(known)) <= 1 and _is_edit_distance_leq_one(token, known)
        for known in _SINGLE_TOKEN_NOUNS
    )
|
| 141 |
|
| 142 |
# Compact context block passed to Gemini when generating a specific not-found
|
| 143 |
# suggestion. One sentence per major entity class — tight token budget.
|
|
|
|
| 166 |
tokens = re.findall(r"[a-z0-9]+", query.lower())
|
| 167 |
# Single-token check
|
| 168 |
for token in tokens:
|
| 169 |
+
if _token_matches_known_portfolio_noun(token):
|
| 170 |
return True
|
| 171 |
# Bigram check — catches "vk live", "text ops", "echo echo"
|
| 172 |
for a, b in zip(tokens, tokens[1:]):
|
app/pipeline/nodes/retrieve.py
CHANGED
|
@@ -111,6 +111,71 @@ _TYPE_REMAP: dict[str, str] = {
|
|
| 111 |
"resume": "resume", # RC-3: explicit pass-through so resume chunks aren't "unknown"
|
| 112 |
}
|
| 113 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 114 |
|
| 115 |
def make_retrieve_node(
|
| 116 |
vector_store: VectorStore, embedder: Embedder, reranker: Reranker
|
|
@@ -125,6 +190,7 @@ def make_retrieve_node(
|
|
| 125 |
# cosine similarity against "PersonaBot RAG pipeline" passages; the rewrite
|
| 126 |
# "What ML projects has Darshan built?" dramatically improves recall.
|
| 127 |
retrieval_query = state.get("decontextualized_query") or query
|
|
|
|
| 128 |
|
| 129 |
# Reuse the topic computed by the guard node — no recomputation needed.
|
| 130 |
topic = state.get("query_topic") or ""
|
|
|
|
| 111 |
"resume": "resume", # RC-3: explicit pass-through so resume chunks aren't "unknown"
|
| 112 |
}
|
| 113 |
|
| 114 |
+
# Flat vocabulary of single-word focus keywords for typo correction.
# NOTE(review): assumes _FOCUS_KEYWORDS is keyed by tuples/iterables of
# keyword strings — if a key were a plain string, the inner loop would
# iterate its characters. Confirm against the _FOCUS_KEYWORDS definition.
_FOCUS_VOCAB: frozenset[str] = frozenset(
    keyword
    for key_group in _FOCUS_KEYWORDS
    for keyword in key_group
    if " " not in keyword
)
|
| 122 |
+
|
| 123 |
+
|
| 124 |
+
def _edit_distance(a: str, b: str) -> int:
|
| 125 |
+
la, lb = len(a), len(b)
|
| 126 |
+
dp = list(range(lb + 1))
|
| 127 |
+
for i in range(1, la + 1):
|
| 128 |
+
prev = dp[0]
|
| 129 |
+
dp[0] = i
|
| 130 |
+
for j in range(1, lb + 1):
|
| 131 |
+
cur = dp[j]
|
| 132 |
+
cost = 0 if a[i - 1] == b[j - 1] else 1
|
| 133 |
+
dp[j] = min(dp[j] + 1, dp[j - 1] + 1, prev + cost)
|
| 134 |
+
prev = cur
|
| 135 |
+
return dp[lb]
|
| 136 |
+
|
| 137 |
+
|
| 138 |
+
def _best_focus_replacement(token: str) -> str | None:
    """Return the closest focus-vocab word within edit distance 2, or None.

    Candidates must share the first letter and differ in length by at most
    one — cheap filters that skip the DP for most of the vocabulary.

    Iterates the vocabulary in sorted order so ties between equal-distance
    candidates resolve deterministically; iterating the frozenset directly
    depends on hash order, which varies with PYTHONHASHSEED across runs.
    """
    if not token:  # guard token[0] below
        return None
    best: str | None = None
    best_score = 3  # accept only distance <= 2; strict '<' keeps first winner
    first = token[0]
    for candidate in sorted(_FOCUS_VOCAB):
        if candidate[0] != first:
            continue
        if abs(len(token) - len(candidate)) > 1:
            continue
        score = _edit_distance(token, candidate)
        if score < best_score:
            best_score = score
            best = candidate
    return best
|
| 151 |
+
|
| 152 |
+
|
| 153 |
+
def _normalise_focus_typos(query: str) -> str:
    """
    Correct minor STT typos for intent words used by focused retrieval.

    Example: "walk experience" -> "work experience".

    Side effect: the query is lowercased and runs of whitespace collapse to
    single spaces, because correction works on lowercased split tokens.
    """
    words = query.lower().split()
    if not words:
        return query

    rebuilt: list[str] = []
    for word in words:
        # Strip surrounding punctuation so "experience?" still matches.
        core = word.strip(".,!?;:\"'()[]{}")
        if len(core) < 4 or core in _FOCUS_VOCAB:
            rebuilt.append(word)
            continue
        fixed = _best_focus_replacement(core)
        rebuilt.append(word.replace(core, fixed) if fixed else word)
    return " ".join(rebuilt)
|
| 178 |
+
|
| 179 |
|
| 180 |
def make_retrieve_node(
|
| 181 |
vector_store: VectorStore, embedder: Embedder, reranker: Reranker
|
|
|
|
| 190 |
# cosine similarity against "PersonaBot RAG pipeline" passages; the rewrite
|
| 191 |
# "What ML projects has Darshan built?" dramatically improves recall.
|
| 192 |
retrieval_query = state.get("decontextualized_query") or query
|
| 193 |
+
retrieval_query = _normalise_focus_typos(retrieval_query)
|
| 194 |
|
| 195 |
# Reuse the topic computed by the guard node — no recomputation needed.
|
| 196 |
topic = state.get("query_topic") or ""
|
tests/test_enumerate_query.py
CHANGED
|
@@ -211,3 +211,9 @@ class TestIsPortfolioRelevant:
|
|
| 211 |
|
| 212 |
def test_empty_string(self):
|
| 213 |
assert is_portfolio_relevant("") is False
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 211 |
|
| 212 |
def test_empty_string(self):
|
| 213 |
assert is_portfolio_relevant("") is False
|
| 214 |
+
|
| 215 |
+
def test_resume_intent_keywords_are_relevant(self):
|
| 216 |
+
assert is_portfolio_relevant("tell me about his work experience") is True
|
| 217 |
+
|
| 218 |
+
def test_stt_typo_work_experience_is_still_relevant(self):
|
| 219 |
+
assert is_portfolio_relevant("tell me about his walk experience") is True
|
tests/test_retrieve_query_normalization.py
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from app.pipeline.nodes.retrieve import _normalise_focus_typos


def test_walk_experience_normalises_to_work_experience() -> None:
    fixed = _normalise_focus_typos("Can you tell me about his walk experience then?")
    assert fixed == "can you tell me about his work experience then?"


def test_non_focus_text_is_not_overwritten() -> None:
    original = "Tell me about widget orchestration internals"
    # Tokenisation lowercases the query even when nothing is corrected.
    assert _normalise_focus_typos(original) == original.lower()
|