File size: 7,112 Bytes
0da0699
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b616cc1
 
 
 
 
 
 
 
c44df3b
 
 
b616cc1
 
0da0699
 
 
b616cc1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0da0699
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b616cc1
0da0699
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
"""
backend/app/core/portfolio_context.py

Known portfolio entities extracted from the TOON context file.

Two purposes:
  1. Fix 2 Rule 1 — CRAG routing: detect whether a failed query is asking
     about something genuinely in the portfolio.  When the first CRAG retry
     also fails, a second retry is allowed for queries that mention known
     entities.  This prevents the not-found response from firing on queries
     that should have findings (e.g. "how does textops work?").

  2. Fix 2 Rule 2 — Not-found specific suggestion: the generate node passes
     the TOON entity list to Gemini so it can produce a specific redirect like
     "Try asking about his TextOps Kubernetes setup" rather than the generic
     "ask about his projects".

Entity list is manually maintained from the TOON context file and must be
updated whenever refresh_gemini_context.py adds new content.
Deliberate duplication: the TOON file is runtime state (may be absent in tests);
this module is compile-time — no file I/O, no latency, no failure mode.
"""
from __future__ import annotations

import re
# ---------------------------------------------------------------------------
# Known project names (as they appear in the TOON file and corpus)
#
# Each project is listed in every spelling users actually type (hyphenated,
# spaced, suffixed) so both the single-token and bigram checks in
# is_portfolio_relevant() can hit.
# ---------------------------------------------------------------------------
KNOWN_PROJECTS: frozenset[str] = frozenset({
    "textops", "text ops",
    "echo-echo", "echo echo",
    "localhost",
    "donut-asm", "donut asm", "donut.c", "donut",
    "save-the-planet", "save the planet",
    "sorting-demo", "sorting demo",
    "student-management-system", "student management system",
    "sysphus",
    "personabot", "persona bot",
})

# ---------------------------------------------------------------------------
# Known technologies (canonical forms + common abbreviations)
#
# Grouped by domain for maintainability only — the set is flat at runtime.
# All entries are lowercase because is_portfolio_relevant() lowercases the
# query before matching.
# ---------------------------------------------------------------------------
KNOWN_TECHNOLOGIES: frozenset[str] = frozenset({
    # Languages
    "python", "go", "golang", "java", "javascript", "typescript",
    "assembly", "x86", "sql", "html", "css",
    # Frameworks / libraries
    "fastapi", "react", "node.js", "nodejs", "express", "ejs",
    "langgraph", "langchain", "pydantic",
    # Infra / cloud
    "docker", "kubernetes", "aws", "gcp", "terraform", "ci/cd", "gitlab",
    "github actions", "nginx",
    # ML / AI
    "yolo", "yolov8", "ncnn", "onnx",
    "rag", "llm", "llms", "groq", "gemini", "qdrant",
    "sentence-transformers", "bge", "cross-encoder", "bm25",
    # Networking / P2P
    "webrtc", "kademlia", "tor", "dht", "p2p",
    # Database
    "sqlite", "postgres", "postgresql", "mysql", "mongodb", "orm",
    # Testing
    "junit", "pytest",
    "jwt", "owasp",
    # Monitoring
    "prometheus", "mlflow", "dagshub",
    # Misc
    "microservices", "serverless", "e2ee",
})

# ---------------------------------------------------------------------------
# Known companies / educational institutions
#
# NOTE: "groq" and "qdrant"-adjacent names also appear in KNOWN_TECHNOLOGIES;
# the union in ALL_PORTFOLIO_NOUNS deduplicates, so the overlap is harmless.
# ---------------------------------------------------------------------------
KNOWN_ORGS: frozenset[str] = frozenset({
    # Employment (update from TOON / resume as new roles are indexed)
    "vk live", "vklive",
    # Education
    "university",
    # Platforms / services
    "github", "groq", "huggingface", "vercel", "cloudflare", "qdrant cloud",
})

# ---------------------------------------------------------------------------
# Intent nouns that should always route to portfolio retrieval paths
# (especially resume/CV questions that may not mention named entities).
#
# These are generic nouns ("skills", "education"), so a query containing any
# of them is treated as portfolio-relevant even with no project/org mention.
# ---------------------------------------------------------------------------
KNOWN_INTENTS: frozenset[str] = frozenset({
    "work", "experience", "work experience", "career", "employment", "job", "role",
    "internship", "internships", "skills", "skill", "education", "degree", "university",
    "resume", "cv", "background", "certification", "certifications",
    "tech", "stack", "tech stack", "technology", "technologies",
    "framework", "frameworks", "tool", "tools", "tooling",
    "language", "languages",
})

# ---------------------------------------------------------------------------
# All known portfolio nouns in one flat set for O(1) membership checks
# ---------------------------------------------------------------------------
ALL_PORTFOLIO_NOUNS: frozenset[str] = KNOWN_PROJECTS | KNOWN_TECHNOLOGIES | KNOWN_ORGS | KNOWN_INTENTS

# Single-token subset for typo-tolerant matching (e.g. "pythn" -> "python").
# Multi-word entries are excluded because the fuzzy check operates on
# individual query tokens; bigrams are matched exactly instead.
_SINGLE_TOKEN_NOUNS: frozenset[str] = frozenset({n for n in ALL_PORTFOLIO_NOUNS if " " not in n})


def _is_edit_distance_leq_one(a: str, b: str) -> bool:
    """Fast check for Levenshtein distance <= 1 (substitute/insert/delete)."""
    if a == b:
        return True
    la, lb = len(a), len(b)
    if abs(la - lb) > 1:
        return False

    if la == lb:
        mismatches = sum(1 for x, y in zip(a, b) if x != y)
        return mismatches <= 1

    # Ensure a is shorter for insert/delete logic.
    if la > lb:
        a, b = b, a
        la, lb = lb, la

    i = j = 0
    mismatch = 0
    while i < la and j < lb:
        if a[i] == b[j]:
            i += 1
            j += 1
            continue
        mismatch += 1
        if mismatch > 1:
            return False
        j += 1
    return True


def _token_matches_known_portfolio_noun(token: str) -> bool:
    """True when *token* names a portfolio noun, exactly or within one typo."""
    if token in ALL_PORTFOLIO_NOUNS:
        return True
    # Fuzzy matching only applies to tokens of 4+ characters so that short
    # common words cannot accidentally collide with short entity names.
    if len(token) < 4:
        return False
    # Length pre-filter skips the edit-distance scan for hopeless candidates.
    return any(
        abs(len(token) - len(noun)) <= 1 and _is_edit_distance_leq_one(token, noun)
        for noun in _SINGLE_TOKEN_NOUNS
    )

# Compact context block passed to Gemini when generating a specific not-found
# suggestion.  One sentence per major entity class — tight token budget.
# NOTE: keep in sync with the KNOWN_* sets above and with the TOON context
# file whenever refresh_gemini_context.py indexes new content.
SUGGESTION_HINT: str = (
    "Darshan's portfolio includes: "
    "projects (TextOps, Echo-Echo, Localhost, Donut-ASM, Sysphus, Save the Planet, Sorting Demo, "
    "Student Management System, PersonaBot); "
    "skills and technologies (Python, Go, FastAPI, LangGraph, RAG, Qdrant, Groq, Docker, Kubernetes, "
    "AWS, WebRTC, Kademlia DHT, YOLOv8, Assembly x86, Java, React, Node.js); "
    "blog posts (60 FPS Object Detection on Android, Prompt Engineering Jailbreaks); "
    "work experience and education (ask about his resume/CV for employer details)."
)


def is_portfolio_relevant(query: str) -> bool:
    """
    Return True when the query mentions at least one known portfolio entity.

    Used by graph routing (Fix 2 Rule 1) to decide whether a second CRAG
    retry is warranted after the first retry also found nothing.

    Token-level check: split on non-alphanumeric, lowercase, check membership
    (with one-typo tolerance for tokens of 4+ chars), then check adjacent
    token pairs against multi-word entities.
    ~5µs per call on a 20-token query — zero latency impact.
    """
    # `re` is imported at module level; the pattern is tiny and re's internal
    # cache makes repeated findall calls effectively free.
    tokens = re.findall(r"[a-z0-9]+", query.lower())
    # Single-token check (exact or typo-tolerant).
    if any(_token_matches_known_portfolio_noun(token) for token in tokens):
        return True
    # Bigram check — catches "vk live", "text ops", "echo echo".
    return any(f"{a} {b}" in ALL_PORTFOLIO_NOUNS for a, b in zip(tokens, tokens[1:]))