""" backend/app/core/portfolio_context.py Known portfolio entities extracted from the TOON context file. Two purposes: 1. Fix 2 Rule 1 — CRAG routing: detect whether a failed query is asking about something genuinely in the portfolio. When the first CRAG retry also fails, a second retry is allowed for queries that mention known entities. This prevents the not-found response from firing on queries that should have findings (e.g. "how does textops work?"). 2. Fix 2 Rule 2 — Not-found specific suggestion: the generate node passes the TOON entity list to Gemini so it can produce a specific redirect like "Try asking about his TextOps Kubernetes setup" rather than the generic "ask about his projects". Entity list is manually maintained from the TOON context file and must be updated whenever refresh_gemini_context.py adds new content. Deliberate duplication: the TOON file is runtime state (may be absent in tests); this module is compile-time — no file I/O, no latency, no failure mode. """ from __future__ import annotations # --------------------------------------------------------------------------- # Known project names (as they appear in the TOON file and corpus) # --------------------------------------------------------------------------- KNOWN_PROJECTS: frozenset[str] = frozenset({ "textops", "text ops", "echo-echo", "echo echo", "localhost", "donut-asm", "donut asm", "donut.c", "donut", "save-the-planet", "save the planet", "sorting-demo", "sorting demo", "student-management-system", "student management system", "sysphus", "personabot", "persona bot", }) # --------------------------------------------------------------------------- # Known technologies (canonical forms + common abbreviations) # --------------------------------------------------------------------------- KNOWN_TECHNOLOGIES: frozenset[str] = frozenset({ # Languages "python", "go", "golang", "java", "javascript", "typescript", "assembly", "x86", "sql", "html", "css", # Frameworks / libraries "fastapi", 
"react", "node.js", "nodejs", "express", "ejs", "langgraph", "langchain", "pydantic", # Infra / cloud "docker", "kubernetes", "aws", "gcp", "terraform", "ci/cd", "gitlab", "github actions", "nginx", # ML / AI "yolo", "yolov8", "ncnn", "onnx", "rag", "llm", "llms", "groq", "gemini", "qdrant", "sentence-transformers", "bge", "cross-encoder", "bm25", # Networking / P2P "webrtc", "kademlia", "tor", "dht", "p2p", # Database "sqlite", "postgres", "postgresql", "mysql", "mongodb", "orm", # Testing "junit", "pytest", "jwt", "owasp", # Monitoring "prometheus", "mlflow", "dagshub", # Misc "microservices", "serverless", "e2ee", }) # --------------------------------------------------------------------------- # Known companies / educational institutions # --------------------------------------------------------------------------- KNOWN_ORGS: frozenset[str] = frozenset({ # Employment (update from TOON / resume as new roles are indexed) "vk live", "vklive", # Education "university", # Platforms / services "github", "groq", "huggingface", "vercel", "cloudflare", "qdrant cloud", }) # --------------------------------------------------------------------------- # Intent nouns that should always route to portfolio retrieval paths # (especially resume/CV questions that may not mention named entities). 
# ---------------------------------------------------------------------------
KNOWN_INTENTS: frozenset[str] = frozenset({
    "work", "experience", "work experience", "career", "employment", "job",
    "role", "internship", "internships",
    "skills", "skill", "education", "degree", "university",
    "resume", "cv", "background", "certification", "certifications",
    "tech", "stack", "tech stack", "technology", "technologies",
    "framework", "frameworks", "tool", "tools", "tooling",
    "language", "languages",
})

# ---------------------------------------------------------------------------
# All known portfolio nouns in one flat set for O(1) membership checks
# ---------------------------------------------------------------------------
ALL_PORTFOLIO_NOUNS: frozenset[str] = KNOWN_PROJECTS | KNOWN_TECHNOLOGIES | KNOWN_ORGS | KNOWN_INTENTS

# Single-token subset for typo-tolerant matching (e.g. "pyhton" -> "python").
# Only distance-1 typos (one insert/delete/substitute) are tolerated, so
# e.g. "walk" does NOT match "work" (that pair is distance 2).
_SINGLE_TOKEN_NOUNS: frozenset[str] = frozenset({n for n in ALL_PORTFOLIO_NOUNS if " " not in n})


def _is_edit_distance_leq_one(a: str, b: str) -> bool:
    """Fast check for Levenshtein distance <= 1 (substitute/insert/delete).

    Avoids a full DP table: equal-length strings can only differ by
    substitutions, and strings whose lengths differ by one can only be a
    single insert/delete with every other character matching in order.
    """
    if a == b:
        return True
    la, lb = len(a), len(b)
    if abs(la - lb) > 1:
        return False
    if la == lb:
        # Same length: distance <= 1 iff at most one substituted position.
        mismatches = sum(1 for x, y in zip(a, b) if x != y)
        return mismatches <= 1
    # Ensure a is shorter for insert/delete logic.
    if la > lb:
        a, b = b, a
        la, lb = lb, la
    i = j = 0
    mismatch = 0
    while i < la and j < lb:
        if a[i] == b[j]:
            i += 1
            j += 1
            continue
        # Skip one character of the longer string; a second skip means the
        # distance exceeds 1.
        mismatch += 1
        if mismatch > 1:
            return False
        j += 1
    return True


def _token_matches_known_portfolio_noun(token: str) -> bool:
    """Return True when *token* is, or is one typo away from, a known noun.

    Tokens shorter than 4 characters must match exactly: fuzzy matching on
    short nouns ("go", "cv", "aws") would fire on far too many ordinary
    English words.
    """
    if token in ALL_PORTFOLIO_NOUNS:
        return True
    lt = len(token)
    if lt < 4:
        return False
    for known in _SINGLE_TOKEN_NOUNS:
        # Length pre-filter keeps the expensive check off obviously-far pairs.
        if abs(lt - len(known)) <= 1 and _is_edit_distance_leq_one(token, known):
            return True
    return False


# Compact context block passed to Gemini when generating a specific not-found
# suggestion. One sentence per major entity class — tight token budget.
SUGGESTION_HINT: str = (
    "Darshan's portfolio includes: "
    "projects (TextOps, Echo-Echo, Localhost, Donut-ASM, Sysphus, Save the Planet, Sorting Demo, "
    "Student Management System, PersonaBot); "
    "skills and technologies (Python, Go, FastAPI, LangGraph, RAG, Qdrant, Groq, Docker, Kubernetes, "
    "AWS, WebRTC, Kademlia DHT, YOLOv8, Assembly x86, Java, React, Node.js); "
    "blog posts (60 FPS Object Detection on Android, Prompt Engineering Jailbreaks); "
    "work experience and education (ask about his resume/CV for employer details)."
)


def is_portfolio_relevant(query: str) -> bool:
    """
    Return True when the query mentions at least one known portfolio entity.

    Used by graph routing (Fix 2 Rule 1) to decide whether a second CRAG
    retry is warranted after the first retry also found nothing.

    Token-level check: split on non-alphanumeric, lowercase, then test
    single tokens (exact + typo-tolerant), bigrams, and trigrams against
    the known-noun sets. Microseconds per call — zero latency impact.
    """
    import re

    tokens = re.findall(r"[a-z0-9]+", query.lower())

    # Single-token check (exact membership or one-typo fuzzy match).
    for token in tokens:
        if _token_matches_known_portfolio_noun(token):
            return True

    # Bigram check — catches "vk live", "text ops", "echo echo". The joined
    # form additionally catches punctuated spellings whose tokens split at
    # the punctuation, e.g. "node.js" -> ("node", "js") -> "nodejs".
    for a, b in zip(tokens, tokens[1:]):
        if f"{a} {b}" in ALL_PORTFOLIO_NOUNS or f"{a}{b}" in ALL_PORTFOLIO_NOUNS:
            return True

    # Trigram check — required for three-word entries such as
    # "save the planet" and "student management system", which no single
    # token or bigram can reach.
    for a, b, c in zip(tokens, tokens[1:], tokens[2:]):
        if f"{a} {b} {c}" in ALL_PORTFOLIO_NOUNS:
            return True

    return False