"""Pre-process user queries before retrieval."""
from __future__ import annotations
import re
from rag_engine.utils.logger import get_logger
logger = get_logger(__name__)
# Leading words that mark a query as question-like. The match is anchored at
# the start of the (already stripped) query, is case-insensitive, and requires
# a word boundary so that e.g. "whatever" or "island" do not count.
# Covers the wh-words plus the common auxiliary/modal verbs that open yes/no
# questions ("did", "was", "should", ...), not only the present-tense forms.
_QUESTION_STARTERS = re.compile(
    r"^(what|which|who|whom|whose|is|are|was|were|does|do|did|has|have"
    r"|can|could|will|would|should|how|when|where|why)\b",
    re.IGNORECASE,
)

# Keyword sets compared against the lower-cased tokens of a query.
# The first three select the value stored under "coverage_category".
_FLOOD_KEYWORDS = {"flood", "water", "storm", "hurricane"}
_FIRE_KEYWORDS = {"fire", "smoke", "explosion", "burn"}
_THEFT_KEYWORDS = {"theft", "stolen", "burglary", "robbery"}

# These two switch on the boolean "deductible_related" / "limit_related" flags.
_DEDUCTIBLE_KEYWORDS = {"deductible", "excess", "out-of-pocket"}
_LIMIT_KEYWORDS = {"limit", "maximum", "cap", "ceiling"}
class QueryPreprocessor:
    """Clean and enrich user queries for retrieval.

    The class keeps no instance state: both methods work only from their
    arguments plus the module-level question regex, keyword sets and logger.
    """

    # ------------------------------------------------------------------ #
    # preprocess
    # ------------------------------------------------------------------ #
    def preprocess(self, query: str) -> str:
        """Normalise whitespace and append ``?`` to question-like queries.

        Steps, in order:

        1. Strip leading/trailing whitespace.
        2. If the query starts with a question word and does not already end
           with ``?``, upper-case its first character and append ``?``.
           Queries that already end with ``?`` are left exactly as typed
           (no capitalisation is applied to them).
        3. Collapse every run of whitespace into a single space.

        Args:
            query: Raw user query.

        Returns:
            The cleaned query string.
        """
        original = query
        result = query.strip()

        # Append "?" for question-like phrases that don't already end with one.
        if not result.endswith("?") and _QUESTION_STARTERS.match(result):
            # A successful match means ``result`` is non-empty, so indexing
            # the first character is safe.
            result = result[0].upper() + result[1:] + "?"

        # Collapse multiple spaces (and any other whitespace runs).
        result = re.sub(r"\s+", " ", result)

        logger.info("Query preprocessed: '%s' → '%s'", original, result)
        return result

    # ------------------------------------------------------------------ #
    # extract_filters
    # ------------------------------------------------------------------ #
    def extract_filters(self, query: str, policy_id: str | None = None) -> dict:
        """Build a Supabase metadata filter dict from *query* keywords.

        Args:
            query: User query; matching is case-insensitive and works on
                whole tokens.
            policy_id: When not ``None``, copied into the result under
                ``"policy_id"``.

        Returns:
            A dict containing any of ``"policy_id"``, ``"coverage_category"``
            (``"flood"``, ``"fire"`` or ``"theft"``), ``"deductible_related"``
            and ``"limit_related"``. If a query mentions several coverage
            categories, the last check to match wins
            (theft over fire over flood).
        """
        filters: dict = {}
        if policy_id is not None:
            filters["policy_id"] = policy_id

        # Tokenise on alphanumeric runs rather than bare whitespace so that a
        # keyword with attached punctuation ("fire?", "theft,", "(flood)")
        # still matches -- important because preprocess() itself appends "?".
        # Internal hyphens are kept so "out-of-pocket" remains one token.
        words = set(re.findall(r"[a-z0-9]+(?:-[a-z0-9]+)*", query.lower()))

        # Independent ``if`` checks (not ``elif``): a later category
        # deliberately overwrites an earlier one.
        if words & _FLOOD_KEYWORDS:
            filters["coverage_category"] = "flood"
        if words & _FIRE_KEYWORDS:
            filters["coverage_category"] = "fire"
        if words & _THEFT_KEYWORDS:
            filters["coverage_category"] = "theft"

        if words & _DEDUCTIBLE_KEYWORDS:
            filters["deductible_related"] = True
        if words & _LIMIT_KEYWORDS:
            filters["limit_related"] = True

        logger.info("Extracted filters: %s", filters)
        return filters
|