| """ | |
| backend/app/core/portfolio_context.py | |
| Known portfolio entities extracted from the TOON context file. | |
| Two purposes: | |
| 1. Fix 2 Rule 1 — CRAG routing: detect whether a failed query is asking | |
| about something genuinely in the portfolio. When the first CRAG retry | |
| also fails, a second retry is allowed for queries that mention known | |
| entities. This prevents the not-found response from firing on queries | |
| that should have findings (e.g. "how does textops work?"). | |
| 2. Fix 2 Rule 2 — Not-found specific suggestion: the generate node passes | |
| the TOON entity list to Gemini so it can produce a specific redirect like | |
| "Try asking about his TextOps Kubernetes setup" rather than the generic | |
| "ask about his projects". | |
| Entity list is manually maintained from the TOON context file and must be | |
| updated whenever refresh_gemini_context.py adds new content. | |
| Deliberate duplication: the TOON file is runtime state (may be absent in tests); | |
| this module is compile-time — no file I/O, no latency, no failure mode. | |
| """ | |
from __future__ import annotations

import re
# ---------------------------------------------------------------------------
# Known project names (as they appear in the TOON file and corpus)
# ---------------------------------------------------------------------------
# Each project is listed together with its common spelling variants so that
# tokenized queries match regardless of hyphenation / spacing.
KNOWN_PROJECTS: frozenset[str] = frozenset({
    "textops",
    "text ops",
    "echo-echo",
    "echo echo",
    "localhost",
    "donut-asm",
    "donut asm",
    "donut.c",
    "donut",
    "save-the-planet",
    "save the planet",
    "sorting-demo",
    "sorting demo",
    "student-management-system",
    "student management system",
    "sysphus",
    "personabot",
    "persona bot",
})
| # --------------------------------------------------------------------------- | |
| # Known technologies (canonical forms + common abbreviations) | |
| # --------------------------------------------------------------------------- | |
| KNOWN_TECHNOLOGIES: frozenset[str] = frozenset({ | |
| # Languages | |
| "python", "go", "golang", "java", "javascript", "typescript", | |
| "assembly", "x86", "sql", "html", "css", | |
| # Frameworks / libraries | |
| "fastapi", "react", "node.js", "nodejs", "express", "ejs", | |
| "langgraph", "langchain", "pydantic", | |
| # Infra / cloud | |
| "docker", "kubernetes", "aws", "gcp", "terraform", "ci/cd", "gitlab", | |
| "github actions", "nginx", | |
| # ML / AI | |
| "yolo", "yolov8", "ncnn", "onnx", | |
| "rag", "llm", "llms", "groq", "gemini", "qdrant", | |
| "sentence-transformers", "bge", "cross-encoder", "bm25", | |
| # Networking / P2P | |
| "webrtc", "kademlia", "tor", "dht", "p2p", | |
| # Database | |
| "sqlite", "postgres", "postgresql", "mysql", "mongodb", "orm", | |
| # Testing | |
| "junit", "pytest", | |
| "jwt", "owasp", | |
| # Monitoring | |
| "prometheus", "mlflow", "dagshub", | |
| # Misc | |
| "microservices", "serverless", "e2ee", | |
| }) | |
# ---------------------------------------------------------------------------
# Known companies / educational institutions
# ---------------------------------------------------------------------------
KNOWN_ORGS: frozenset[str] = frozenset({
    # Employment (update from TOON / resume as new roles are indexed)
    "vk live",
    "vklive",
    # Education
    "university",
    # Platforms / services
    "github",
    "groq",
    "huggingface",
    "vercel",
    "cloudflare",
    "qdrant cloud",
})
| # --------------------------------------------------------------------------- | |
| # Intent nouns that should always route to portfolio retrieval paths | |
| # (especially resume/CV questions that may not mention named entities). | |
| # --------------------------------------------------------------------------- | |
| KNOWN_INTENTS: frozenset[str] = frozenset({ | |
| "work", "experience", "work experience", "career", "employment", "job", "role", | |
| "internship", "internships", "skills", "skill", "education", "degree", "university", | |
| "resume", "cv", "background", "certification", "certifications", | |
| "tech", "stack", "tech stack", "technology", "technologies", | |
| "framework", "frameworks", "tool", "tools", "tooling", | |
| "language", "languages", | |
| }) | |
# ---------------------------------------------------------------------------
# All known portfolio nouns in one flat set for O(1) membership checks
# ---------------------------------------------------------------------------
ALL_PORTFOLIO_NOUNS: frozenset[str] = (
    KNOWN_PROJECTS | KNOWN_TECHNOLOGIES | KNOWN_ORGS | KNOWN_INTENTS
)

# Single-token subset used for typo-tolerant matching at edit distance <= 1
# (e.g. "kuberntes" -> "kubernetes"). Multi-word nouns are matched by the
# bigram pass in is_portfolio_relevant instead.
_SINGLE_TOKEN_NOUNS: frozenset[str] = frozenset(
    noun for noun in ALL_PORTFOLIO_NOUNS if " " not in noun
)
| def _is_edit_distance_leq_one(a: str, b: str) -> bool: | |
| """Fast check for Levenshtein distance <= 1 (substitute/insert/delete).""" | |
| if a == b: | |
| return True | |
| la, lb = len(a), len(b) | |
| if abs(la - lb) > 1: | |
| return False | |
| if la == lb: | |
| mismatches = sum(1 for x, y in zip(a, b) if x != y) | |
| return mismatches <= 1 | |
| # Ensure a is shorter for insert/delete logic. | |
| if la > lb: | |
| a, b = b, a | |
| la, lb = lb, la | |
| i = j = 0 | |
| mismatch = 0 | |
| while i < la and j < lb: | |
| if a[i] == b[j]: | |
| i += 1 | |
| j += 1 | |
| continue | |
| mismatch += 1 | |
| if mismatch > 1: | |
| return False | |
| j += 1 | |
| return True | |
def _token_matches_known_portfolio_noun(token: str) -> bool:
    """
    Return True when *token* is a known portfolio noun, exactly or within
    one typo (edit distance <= 1 against the single-token noun set).
    """
    if token in ALL_PORTFOLIO_NOUNS:
        return True
    # Tokens shorter than 4 chars are too noisy for fuzzy matching — nearly
    # every short English word sits within one edit of something.
    if len(token) < 4:
        return False
    # Length pre-filter is redundant with the distance check but skips the
    # character comparison for most candidates.
    return any(
        abs(len(token) - len(noun)) <= 1 and _is_edit_distance_leq_one(token, noun)
        for noun in _SINGLE_TOKEN_NOUNS
    )
# Compact context block passed to Gemini when generating a specific not-found
# suggestion. One sentence per major entity class — tight token budget.
# NOTE: manually maintained; update alongside the KNOWN_* sets whenever
# refresh_gemini_context.py indexes new content.
SUGGESTION_HINT: str = (
    "Darshan's portfolio includes: "
    "projects (TextOps, Echo-Echo, Localhost, Donut-ASM, Sysphus, Save the Planet, Sorting Demo, "
    "Student Management System, PersonaBot); "
    "skills and technologies (Python, Go, FastAPI, LangGraph, RAG, Qdrant, Groq, Docker, Kubernetes, "
    "AWS, WebRTC, Kademlia DHT, YOLOv8, Assembly x86, Java, React, Node.js); "
    "blog posts (60 FPS Object Detection on Android, Prompt Engineering Jailbreaks); "
    "work experience and education (ask about his resume/CV for employer details)."
)
# Tokenizer for queries: lowercase alphanumeric runs. Compiled once at module
# level so the hot routing path pays no per-call import or pattern lookup.
_TOKEN_RE = re.compile(r"[a-z0-9]+")


def is_portfolio_relevant(query: str) -> bool:
    """
    Return True when the query mentions at least one known portfolio entity.

    Used by graph routing (Fix 2 Rule 1) to decide whether a second CRAG
    retry is warranted after the first retry also found nothing.

    Token-level check: split on non-alphanumeric, lowercase, check membership
    (exact or one-typo fuzzy), then check adjacent token pairs.
    ~5µs per call on a 20-token query — zero latency impact.

    Parameters:
        query: raw user query, any casing / punctuation.
    """
    tokens = _TOKEN_RE.findall(query.lower())

    # Single-token check (exact membership or edit-distance-1 fuzzy match).
    for token in tokens:
        if _token_matches_known_portfolio_noun(token):
            return True

    # Bigram check. The spaced form catches "vk live", "text ops",
    # "echo echo". The joined form catches nouns whose canonical spelling has
    # no separator: "node.js" / "node js" tokenize to ["node", "js"] and only
    # match via "nodejs".
    for left, right in zip(tokens, tokens[1:]):
        if (
            f"{left} {right}" in ALL_PORTFOLIO_NOUNS
            or f"{left}{right}" in ALL_PORTFOLIO_NOUNS
        ):
            return True

    return False