"""
backend/app/core/portfolio_context.py
Known portfolio entities extracted from the TOON context file.
Two purposes:
1. Fix 2 Rule 1 — CRAG routing: detect whether a failed query is asking
about something genuinely in the portfolio. When the first CRAG retry
also fails, a second retry is allowed for queries that mention known
entities. This prevents the not-found response from firing on queries
that should have findings (e.g. "how does textops work?").
2. Fix 2 Rule 2 — Not-found specific suggestion: the generate node passes
the TOON entity list to Gemini so it can produce a specific redirect like
"Try asking about his TextOps Kubernetes setup" rather than the generic
"ask about his projects".
Entity list is manually maintained from the TOON context file and must be
updated whenever refresh_gemini_context.py adds new content.
Deliberate duplication: the TOON file is runtime state (may be absent in tests);
this module is compile-time — no file I/O, no latency, no failure mode.
"""
from __future__ import annotations

import re
# ---------------------------------------------------------------------------
# Known project names (as they appear in the TOON file and corpus)
# ---------------------------------------------------------------------------
# One tuple per project: the canonical slug first, then the spoken/spaced
# aliases a user is likely to type. Everything is lowercase because queries
# are lowercased before membership checks.
_PROJECT_ALIAS_GROUPS: tuple[tuple[str, ...], ...] = (
    ("textops", "text ops"),
    ("echo-echo", "echo echo"),
    ("localhost",),
    ("donut-asm", "donut asm", "donut.c", "donut"),
    ("save-the-planet", "save the planet"),
    ("sorting-demo", "sorting demo"),
    ("student-management-system", "student management system"),
    ("sysphus",),
    ("personabot", "persona bot"),
)
# Flat set of every project alias for O(1) membership tests.
KNOWN_PROJECTS: frozenset[str] = frozenset(
    alias for group in _PROJECT_ALIAS_GROUPS for alias in group
)
# ---------------------------------------------------------------------------
# Known technologies (canonical forms + common abbreviations)
# ---------------------------------------------------------------------------
# Canonical lowercase technology names plus common abbreviations, grouped by
# category and alphabetized within each group for easy maintenance.
KNOWN_TECHNOLOGIES: frozenset[str] = frozenset((
    # Languages
    "assembly", "css", "go", "golang", "html", "java", "javascript",
    "python", "sql", "typescript", "x86",
    # Frameworks / libraries
    "ejs", "express", "fastapi", "langchain", "langgraph", "node.js",
    "nodejs", "pydantic", "react",
    # Infra / cloud
    "aws", "ci/cd", "docker", "gcp", "github actions", "gitlab",
    "kubernetes", "nginx", "terraform",
    # ML / AI
    "bge", "bm25", "cross-encoder", "gemini", "groq", "llm", "llms",
    "ncnn", "onnx", "qdrant", "rag", "sentence-transformers",
    "yolo", "yolov8",
    # Networking / P2P
    "dht", "kademlia", "p2p", "tor", "webrtc",
    # Database
    "mongodb", "mysql", "orm", "postgres", "postgresql", "sqlite",
    # Testing / security
    "junit", "jwt", "owasp", "pytest",
    # Monitoring / MLOps
    "dagshub", "mlflow", "prometheus",
    # Misc architecture
    "e2ee", "microservices", "serverless",
))
# ---------------------------------------------------------------------------
# Known companies / educational institutions
# ---------------------------------------------------------------------------
# Employers, schools, and hosting platforms mentioned in the corpus.
KNOWN_ORGS: frozenset[str] = frozenset((
    # Employment (update from TOON / resume as new roles are indexed)
    "vk live",
    "vklive",
    # Education
    "university",
    # Platforms / services
    "cloudflare",
    "github",
    "groq",
    "huggingface",
    "qdrant cloud",
    "vercel",
))
# ---------------------------------------------------------------------------
# Intent nouns that should always route to portfolio retrieval paths
# (especially resume/CV questions that may not mention named entities).
# ---------------------------------------------------------------------------
# Generic resume/skills nouns that signal a portfolio question even when no
# named project/technology appears in the query.
KNOWN_INTENTS: frozenset[str] = frozenset((
    # Resume / career questions
    "background", "career", "cv", "employment", "experience", "internship",
    "internships", "job", "resume", "role", "work", "work experience",
    # Education / credentials
    "certification", "certifications", "degree", "education", "university",
    # Skills / tooling questions
    "framework", "frameworks", "language", "languages", "skill", "skills",
    "stack", "tech", "tech stack", "technology", "technologies",
    "tool", "tooling", "tools",
))
# ---------------------------------------------------------------------------
# All known portfolio nouns in one flat set for O(1) membership checks
# ---------------------------------------------------------------------------
# Union of every entity class for a single O(1) membership test.
ALL_PORTFOLIO_NOUNS: frozenset[str] = frozenset().union(
    KNOWN_PROJECTS, KNOWN_TECHNOLOGIES, KNOWN_ORGS, KNOWN_INTENTS
)
# Nouns with no embedded space: only these are eligible for typo-tolerant
# (edit-distance-1) matching against individual query tokens.
_SINGLE_TOKEN_NOUNS: frozenset[str] = frozenset(
    noun for noun in ALL_PORTFOLIO_NOUNS if " " not in noun
)
def _is_edit_distance_leq_one(a: str, b: str) -> bool:
"""Fast check for Levenshtein distance <= 1 (substitute/insert/delete)."""
if a == b:
return True
la, lb = len(a), len(b)
if abs(la - lb) > 1:
return False
if la == lb:
mismatches = sum(1 for x, y in zip(a, b) if x != y)
return mismatches <= 1
# Ensure a is shorter for insert/delete logic.
if la > lb:
a, b = b, a
la, lb = lb, la
i = j = 0
mismatch = 0
while i < la and j < lb:
if a[i] == b[j]:
i += 1
j += 1
continue
mismatch += 1
if mismatch > 1:
return False
j += 1
return True
def _token_matches_known_portfolio_noun(token: str) -> bool:
    """True when *token* is a known noun or a 1-typo variant of one."""
    if token in ALL_PORTFOLIO_NOUNS:
        return True
    if len(token) < 4:
        # Short tokens get no fuzzy matching: near-misses on 2-3 letter
        # words ("go", "cv", "rag") would be almost all false positives.
        return False
    # Length pre-filter is redundant with the distance check but cheaper,
    # so it prunes most candidates before the character walk.
    return any(
        abs(len(token) - len(candidate)) <= 1
        and _is_edit_distance_leq_one(token, candidate)
        for candidate in _SINGLE_TOKEN_NOUNS
    )
# Compact context block passed to Gemini when generating a specific not-found
# suggestion. One sentence per major entity class — tight token budget.
# Entity summary injected into the not-found prompt. Adjacent string
# literals concatenate at compile time; the wording is part of the prompt
# contract, so edit with care.
SUGGESTION_HINT: str = (
    "Darshan's portfolio includes: "
    "projects (TextOps, Echo-Echo, Localhost, Donut-ASM, Sysphus, "
    "Save the Planet, Sorting Demo, Student Management System, PersonaBot); "
    "skills and technologies (Python, Go, FastAPI, LangGraph, RAG, Qdrant, "
    "Groq, Docker, Kubernetes, AWS, WebRTC, Kademlia DHT, YOLOv8, "
    "Assembly x86, Java, React, Node.js); "
    "blog posts (60 FPS Object Detection on Android, Prompt Engineering Jailbreaks); "
    "work experience and education (ask about his resume/CV for employer details)."
)
def is_portfolio_relevant(query: str) -> bool:
    """
    Return True when the query mentions at least one known portfolio entity.

    Used by graph routing (Fix 2 Rule 1) to decide whether a second CRAG
    retry is warranted after the first retry also found nothing.

    Token-level check: split on non-alphanumeric, lowercase, then test
    single tokens (exact or 1-typo), bigrams, and trigrams against the
    known-noun set. A few microseconds per call on a 20-token query —
    zero latency impact.
    """
    tokens = re.findall(r"[a-z0-9]+", query.lower())
    # Single-token check — exact membership or edit-distance-1 typo match.
    for token in tokens:
        if _token_matches_known_portfolio_noun(token):
            return True
    # Multi-word check. Bigrams catch "vk live", "text ops", "echo echo";
    # trigrams catch "save the planet" and "student management system",
    # which a bigram-only scan could never match (bug fix: those entities
    # exist in KNOWN_PROJECTS but were previously unreachable).
    for size in (2, 3):
        for start in range(len(tokens) - size + 1):
            if " ".join(tokens[start : start + size]) in ALL_PORTFOLIO_NOUNS:
                return True
    return False