# personabot-api/app/core/portfolio_context.py
# (deployed via GitHub Actions — commits 2e8cff3 / c44df3b)
"""
backend/app/core/portfolio_context.py
Known portfolio entities extracted from the TOON context file.
Two purposes:
1. Fix 2 Rule 1 — CRAG routing: detect whether a failed query is asking
about something genuinely in the portfolio. When the first CRAG retry
also fails, a second retry is allowed for queries that mention known
entities. This prevents the not-found response from firing on queries
that should have findings (e.g. "how does textops work?").
2. Fix 2 Rule 2 — Not-found specific suggestion: the generate node passes
the TOON entity list to Gemini so it can produce a specific redirect like
"Try asking about his TextOps Kubernetes setup" rather than the generic
"ask about his projects".
Entity list is manually maintained from the TOON context file and must be
updated whenever refresh_gemini_context.py adds new content.
Deliberate duplication: the TOON file is runtime state (may be absent in tests);
this module is compile-time — no file I/O, no latency, no failure mode.
"""
from __future__ import annotations
# ---------------------------------------------------------------------------
# Known project names (as they appear in the TOON file and corpus)
# ---------------------------------------------------------------------------
# Each project is listed in every surface form the corpus uses (hyphenated,
# spaced, and collapsed variants), alphabetically for easy diffing.
KNOWN_PROJECTS: frozenset[str] = frozenset({
    "donut",
    "donut asm",
    "donut-asm",
    "donut.c",
    "echo echo",
    "echo-echo",
    "localhost",
    "persona bot",
    "personabot",
    "save the planet",
    "save-the-planet",
    "sorting demo",
    "sorting-demo",
    "student management system",
    "student-management-system",
    "sysphus",
    "text ops",
    "textops",
})
# ---------------------------------------------------------------------------
# Known technologies (canonical forms + common abbreviations)
# ---------------------------------------------------------------------------
# Grouped by category purely for maintainability; the public constant is the
# flat union of every group.
_TECH_GROUPS: tuple[tuple[str, ...], ...] = (
    # Languages
    ("python", "go", "golang", "java", "javascript", "typescript",
     "assembly", "x86", "sql", "html", "css"),
    # Frameworks / libraries
    ("fastapi", "react", "node.js", "nodejs", "express", "ejs",
     "langgraph", "langchain", "pydantic"),
    # Infra / cloud
    ("docker", "kubernetes", "aws", "gcp", "terraform", "ci/cd", "gitlab",
     "github actions", "nginx"),
    # ML / AI
    ("yolo", "yolov8", "ncnn", "onnx",
     "rag", "llm", "llms", "groq", "gemini", "qdrant",
     "sentence-transformers", "bge", "cross-encoder", "bm25"),
    # Networking / P2P
    ("webrtc", "kademlia", "tor", "dht", "p2p"),
    # Databases
    ("sqlite", "postgres", "postgresql", "mysql", "mongodb", "orm"),
    # Testing / security
    ("junit", "pytest", "jwt", "owasp"),
    # Monitoring / MLOps
    ("prometheus", "mlflow", "dagshub"),
    # Misc
    ("microservices", "serverless", "e2ee"),
)
KNOWN_TECHNOLOGIES: frozenset[str] = frozenset(
    tech for group in _TECH_GROUPS for tech in group
)
# ---------------------------------------------------------------------------
# Known companies / educational institutions
# ---------------------------------------------------------------------------
# Employment (update from TOON / resume as new roles are indexed).
_EMPLOYERS: frozenset[str] = frozenset({"vk live", "vklive"})
# Education.
_SCHOOLS: frozenset[str] = frozenset({"university"})
# Platforms / services.
_PLATFORMS: frozenset[str] = frozenset({
    "github", "groq", "huggingface", "vercel", "cloudflare", "qdrant cloud",
})
KNOWN_ORGS: frozenset[str] = _EMPLOYERS | _SCHOOLS | _PLATFORMS
# ---------------------------------------------------------------------------
# Intent nouns that should always route to portfolio retrieval paths
# (especially resume/CV questions that may not mention named entities).
# ---------------------------------------------------------------------------
# Alphabetical, one per line, so additions produce minimal diffs.
KNOWN_INTENTS: frozenset[str] = frozenset({
    "background",
    "career",
    "certification",
    "certifications",
    "cv",
    "degree",
    "education",
    "employment",
    "experience",
    "framework",
    "frameworks",
    "internship",
    "internships",
    "job",
    "language",
    "languages",
    "resume",
    "role",
    "skill",
    "skills",
    "stack",
    "tech",
    "tech stack",
    "technologies",
    "technology",
    "tool",
    "tooling",
    "tools",
    "university",
    "work",
    "work experience",
})
# ---------------------------------------------------------------------------
# All known portfolio nouns in one flat set for O(1) membership checks
# ---------------------------------------------------------------------------
ALL_PORTFOLIO_NOUNS: frozenset[str] = frozenset().union(
    KNOWN_PROJECTS, KNOWN_TECHNOLOGIES, KNOWN_ORGS, KNOWN_INTENTS
)
# Single-token subset used for typo-tolerant matching (e.g. "kubernets"
# -> "kubernetes"); multi-word phrases are matched exactly, never fuzzily.
_SINGLE_TOKEN_NOUNS: frozenset[str] = frozenset(
    noun for noun in ALL_PORTFOLIO_NOUNS if " " not in noun
)
def _is_edit_distance_leq_one(a: str, b: str) -> bool:
"""Fast check for Levenshtein distance <= 1 (substitute/insert/delete)."""
if a == b:
return True
la, lb = len(a), len(b)
if abs(la - lb) > 1:
return False
if la == lb:
mismatches = sum(1 for x, y in zip(a, b) if x != y)
return mismatches <= 1
# Ensure a is shorter for insert/delete logic.
if la > lb:
a, b = b, a
la, lb = lb, la
i = j = 0
mismatch = 0
while i < la and j < lb:
if a[i] == b[j]:
i += 1
j += 1
continue
mismatch += 1
if mismatch > 1:
return False
j += 1
return True
def _token_matches_known_portfolio_noun(token: str) -> bool:
    """Return True when *token* names a known portfolio noun, exactly or
    within one typo (Levenshtein distance <= 1 against single-word nouns)."""
    if token in ALL_PORTFOLIO_NOUNS:
        return True
    # Fuzzy matching only for tokens of 4+ characters; shorter tokens would
    # accidentally match far too many unrelated words.
    if len(token) < 4:
        return False
    token_len = len(token)
    return any(
        # Cheap length pre-filter before the edit-distance check.
        abs(token_len - len(noun)) <= 1 and _is_edit_distance_leq_one(token, noun)
        for noun in _SINGLE_TOKEN_NOUNS
    )
# Compact context block passed to Gemini when generating a specific not-found
# suggestion (Fix 2 Rule 2). One sentence per major entity class — tight token
# budget. Keep this text in sync with the KNOWN_* sets above and with the TOON
# context file whenever refresh_gemini_context.py adds new content.
SUGGESTION_HINT: str = (
    "Darshan's portfolio includes: "
    "projects (TextOps, Echo-Echo, Localhost, Donut-ASM, Sysphus, Save the Planet, Sorting Demo, "
    "Student Management System, PersonaBot); "
    "skills and technologies (Python, Go, FastAPI, LangGraph, RAG, Qdrant, Groq, Docker, Kubernetes, "
    "AWS, WebRTC, Kademlia DHT, YOLOv8, Assembly x86, Java, React, Node.js); "
    "blog posts (60 FPS Object Detection on Android, Prompt Engineering Jailbreaks); "
    "work experience and education (ask about his resume/CV for employer details)."
)
def is_portfolio_relevant(query: str) -> bool:
    """
    Return True when the query mentions at least one known portfolio entity.

    Used by graph routing (Fix 2 Rule 1) to decide whether a second CRAG
    retry is warranted after the first retry also found nothing.

    Token-level check: split on non-alphanumeric, lowercase, check membership.
    ~5µs per call on a 20-token query — zero latency impact.
    """
    import re

    tokens = re.findall(r"[a-z0-9]+", query.lower())
    # Single-token check (exact membership, plus one-typo fuzzy matching).
    for token in tokens:
        if _token_matches_known_portfolio_noun(token):
            return True
    # N-gram check — catches multi-word nouns. Bigrams cover "vk live",
    # "text ops", "echo echo"; trigrams are required for "save the planet"
    # and "student management system" (previously never matched, since only
    # bigrams were checked). The longest phrase in ALL_PORTFOLIO_NOUNS is
    # three words; extend the range if a longer entity name is ever added.
    for n in (2, 3):
        for start in range(len(tokens) - n + 1):
            if " ".join(tokens[start:start + n]) in ALL_PORTFOLIO_NOUNS:
                return True
    return False