File size: 7,112 Bytes
0da0699
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b616cc1
 
 
 
 
 
 
 
c44df3b
 
 
b616cc1
 
0da0699
 
 
b616cc1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0da0699
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b616cc1
0da0699
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
"""
backend/app/core/portfolio_context.py

Known portfolio entities extracted from the TOON context file.

Two purposes:
  1. Fix 2 Rule 1 — CRAG routing: detect whether a failed query is asking
     about something genuinely in the portfolio.  When the first CRAG retry
     also fails, a second retry is allowed for queries that mention known
     entities.  This prevents the not-found response from firing on queries
     that should have findings (e.g. "how does textops work?").

  2. Fix 2 Rule 2 — Not-found specific suggestion: the generate node passes
     the TOON entity list to Gemini so it can produce a specific redirect like
     "Try asking about his TextOps Kubernetes setup" rather than the generic
     "ask about his projects".

Entity list is manually maintained from the TOON context file and must be
updated whenever refresh_gemini_context.py adds new content.
Deliberate duplication: the TOON file is runtime state (may be absent in tests);
this module is compile-time — no file I/O, no latency, no failure mode.
"""
from __future__ import annotations

import re
# ---------------------------------------------------------------------------
# Known project names (as they appear in the TOON file and corpus)
#
# Each project is listed in every spelling users actually type (hyphenated,
# spaced, suffixed) so both the single-token and bigram checks in
# is_portfolio_relevant() can hit.
# ---------------------------------------------------------------------------
KNOWN_PROJECTS: frozenset[str] = frozenset({
    "textops", "text ops",
    "echo-echo", "echo echo",
    "localhost",
    "donut-asm", "donut asm", "donut.c", "donut",
    "save-the-planet", "save the planet",
    "sorting-demo", "sorting demo",
    "student-management-system", "student management system",
    "sysphus",
    "personabot", "persona bot",
})

# ---------------------------------------------------------------------------
# Known technologies (canonical forms + common abbreviations)
#
# Grouped by domain for maintainability only — the set is flat at runtime.
# All entries are lowercase because is_portfolio_relevant() lowercases the
# query before matching.
# ---------------------------------------------------------------------------
KNOWN_TECHNOLOGIES: frozenset[str] = frozenset({
    # Languages
    "python", "go", "golang", "java", "javascript", "typescript",
    "assembly", "x86", "sql", "html", "css",
    # Frameworks / libraries
    "fastapi", "react", "node.js", "nodejs", "express", "ejs",
    "langgraph", "langchain", "pydantic",
    # Infra / cloud
    "docker", "kubernetes", "aws", "gcp", "terraform", "ci/cd", "gitlab",
    "github actions", "nginx",
    # ML / AI
    "yolo", "yolov8", "ncnn", "onnx",
    "rag", "llm", "llms", "groq", "gemini", "qdrant",
    "sentence-transformers", "bge", "cross-encoder", "bm25",
    # Networking / P2P
    "webrtc", "kademlia", "tor", "dht", "p2p",
    # Database
    "sqlite", "postgres", "postgresql", "mysql", "mongodb", "orm",
    # Testing
    "junit", "pytest",
    "jwt", "owasp",
    # Monitoring
    "prometheus", "mlflow", "dagshub",
    # Misc
    "microservices", "serverless", "e2ee",
})

# ---------------------------------------------------------------------------
# Known companies / educational institutions
#
# NOTE: "groq" and "qdrant"-adjacent names also appear in KNOWN_TECHNOLOGIES;
# the union in ALL_PORTFOLIO_NOUNS deduplicates, so the overlap is harmless.
# ---------------------------------------------------------------------------
KNOWN_ORGS: frozenset[str] = frozenset({
    # Employment (update from TOON / resume as new roles are indexed)
    "vk live", "vklive",
    # Education
    "university",
    # Platforms / services
    "github", "groq", "huggingface", "vercel", "cloudflare", "qdrant cloud",
})

# ---------------------------------------------------------------------------
# Intent nouns that should always route to portfolio retrieval paths
# (especially resume/CV questions that may not mention named entities).
#
# These are generic nouns ("skills", "education"), so a query containing any
# of them is treated as portfolio-relevant even with no project/org mention.
# ---------------------------------------------------------------------------
KNOWN_INTENTS: frozenset[str] = frozenset({
    "work", "experience", "work experience", "career", "employment", "job", "role",
    "internship", "internships", "skills", "skill", "education", "degree", "university",
    "resume", "cv", "background", "certification", "certifications",
    "tech", "stack", "tech stack", "technology", "technologies",
    "framework", "frameworks", "tool", "tools", "tooling",
    "language", "languages",
})

# ---------------------------------------------------------------------------
# All known portfolio nouns in one flat set for O(1) membership checks
# ---------------------------------------------------------------------------
ALL_PORTFOLIO_NOUNS: frozenset[str] = KNOWN_PROJECTS | KNOWN_TECHNOLOGIES | KNOWN_ORGS | KNOWN_INTENTS

# Single-token subset for typo-tolerant matching (e.g. "pythn" -> "python").
# Multi-word entries are excluded because the fuzzy check operates on
# individual query tokens; bigrams are matched exactly instead.
_SINGLE_TOKEN_NOUNS: frozenset[str] = frozenset({n for n in ALL_PORTFOLIO_NOUNS if " " not in n})


def _is_edit_distance_leq_one(a: str, b: str) -> bool:
    """Fast check for Levenshtein distance <= 1 (substitute/insert/delete)."""
    if a == b:
        return True
    la, lb = len(a), len(b)
    if abs(la - lb) > 1:
        return False

    if la == lb:
        mismatches = sum(1 for x, y in zip(a, b) if x != y)
        return mismatches <= 1

    # Ensure a is shorter for insert/delete logic.
    if la > lb:
        a, b = b, a
        la, lb = lb, la

    i = j = 0
    mismatch = 0
    while i < la and j < lb:
        if a[i] == b[j]:
            i += 1
            j += 1
            continue
        mismatch += 1
        if mismatch > 1:
            return False
        j += 1
    return True


def _token_matches_known_portfolio_noun(token: str) -> bool:
    """True when *token* names a portfolio noun, exactly or within one typo."""
    if token in ALL_PORTFOLIO_NOUNS:
        return True
    # Fuzzy matching only applies to tokens of 4+ characters so that short
    # common words cannot accidentally collide with short entity names.
    if len(token) < 4:
        return False
    # Length pre-filter skips the edit-distance scan for hopeless candidates.
    return any(
        abs(len(token) - len(noun)) <= 1 and _is_edit_distance_leq_one(token, noun)
        for noun in _SINGLE_TOKEN_NOUNS
    )

# Compact context block passed to Gemini when generating a specific not-found
# suggestion.  One sentence per major entity class — tight token budget.
# NOTE: keep in sync with the KNOWN_* sets above and with the TOON context
# file whenever refresh_gemini_context.py indexes new content.
SUGGESTION_HINT: str = (
    "Darshan's portfolio includes: "
    "projects (TextOps, Echo-Echo, Localhost, Donut-ASM, Sysphus, Save the Planet, Sorting Demo, "
    "Student Management System, PersonaBot); "
    "skills and technologies (Python, Go, FastAPI, LangGraph, RAG, Qdrant, Groq, Docker, Kubernetes, "
    "AWS, WebRTC, Kademlia DHT, YOLOv8, Assembly x86, Java, React, Node.js); "
    "blog posts (60 FPS Object Detection on Android, Prompt Engineering Jailbreaks); "
    "work experience and education (ask about his resume/CV for employer details)."
)


def is_portfolio_relevant(query: str) -> bool:
    """
    Return True when the query mentions at least one known portfolio entity.

    Used by graph routing (Fix 2 Rule 1) to decide whether a second CRAG
    retry is warranted after the first retry also found nothing.

    Token-level check: split on non-alphanumeric, lowercase, check membership
    (with one-typo tolerance for tokens of 4+ chars), then check adjacent
    token pairs against multi-word entities.
    ~5µs per call on a 20-token query — zero latency impact.
    """
    # `re` is imported at module level; the pattern is tiny and re's internal
    # cache makes repeated findall calls effectively free.
    tokens = re.findall(r"[a-z0-9]+", query.lower())
    # Single-token check (exact or typo-tolerant).
    if any(_token_matches_known_portfolio_noun(token) for token in tokens):
        return True
    # Bigram check — catches "vk live", "text ops", "echo echo".
    return any(f"{a} {b}" in ALL_PORTFOLIO_NOUNS for a, b in zip(tokens, tokens[1:]))