Spaces:
Running
Running
| """ | |
| backend/app/core/topic.py | |
Extracts a 1–3 word topic label from a natural-language query.
| Used by Guard, Retrieve, and any node that surfaces context-specific status | |
| labels ("Checking your question about machine learning", "Searching portfolio | |
| for RAG pipeline") without any LLM call. The extraction is a pure set-lookup | |
— it adds no measurable latency.
| extract_topic("What are Darshan's machine learning projects?") | |
| 'machine learning projects' | |
| extract_topic("Tell me about his background") | |
| 'background' | |
| extract_topic("How does he implement RAG?") | |
| 'implement RAG' | |
| extract_topic("What is") | |
| 'What is' | |
| """ | |
| from __future__ import annotations | |
| import re | |
| # Comprehensive stopword set: prepositions, articles, auxiliary verbs, common | |
| # question words, personal pronouns, demonstratives, and portfolio-query filler. | |
| # Content-bearing words (nouns, adjectives, verbs like "implement", "built") | |
| # are intentionally absent β they ARE the topic. | |
| _STOPWORDS: frozenset[str] = frozenset({ | |
| # Articles | |
| "a", "an", "the", | |
| # Prepositions | |
| "about", "above", "across", "after", "against", "along", "among", | |
| "around", "at", "before", "behind", "below", "beneath", "beside", | |
| "between", "beyond", "by", "during", "except", "for", "from", "in", | |
| "inside", "into", "like", "near", "of", "off", "on", "onto", "out", | |
| "outside", "over", "past", "regarding", "since", "through", | |
| "throughout", "to", "toward", "under", "underneath", "until", "up", | |
| "upon", "with", "within", "without", | |
| # Conjunctions | |
| "and", "but", "or", "nor", "so", "yet", "both", "either", "neither", | |
| # Common auxiliary verbs | |
| "is", "are", "was", "were", "be", "been", "being", | |
| "has", "have", "had", "do", "does", "did", | |
| "will", "would", "could", "should", "may", "might", "can", "shall", | |
| # Question words | |
| "what", "who", "where", "when", "how", "why", "which", | |
| # Personal pronouns | |
| "i", "you", "he", "she", "it", "we", "they", | |
| "me", "him", "her", "us", "them", | |
| "my", "your", "his", "its", "our", "their", | |
| "mine", "yours", "hers", "ours", "theirs", | |
| # Demonstratives | |
| "this", "that", "these", "those", | |
| # Common portfolio-query filler | |
| "tell", "me", "about", "show", "give", "list", "get", "find", | |
| "look", "also", "just", "really", "very", "more", "most", | |
| "some", "any", "other", "another", "same", "such", "own", | |
| "darshan", "chheda", # owner name is not a useful topic word | |
| }) | |
| def extract_topic(query: str) -> str: | |
| """Return a 1β3 word topic phrase extracted from ``query``. | |
| Words matching the stopword set are stripped (case-insensitive). The first | |
| 1β3 remaining words are returned joined by spaces. If the query resolves | |
| to zero content words (all stopwords, or empty), the first two whitespace- | |
| separated tokens of the original query are returned unchanged so the caller | |
| always receives a non-empty string. | |
| """ | |
| tokens = re.findall(r"[a-zA-Z']+", query) | |
| content = [t for t in tokens if t.lower() not in _STOPWORDS and len(t) > 1] | |
| if not content: | |
| # Fallback: keep the first two words of the original query verbatim. | |
| parts = query.strip().split() | |
| return " ".join(parts[:2]) if len(parts) >= 2 else (parts[0] if parts else query) | |
| return " ".join(content[:3]) | |