File size: 2,539 Bytes
5b7955a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
"""Pre-process user queries before retrieval."""

from __future__ import annotations

import re

from rag_engine.utils.logger import get_logger

# Module-level logger obtained from the project's logging helper
# (presumably a thin wrapper around ``logging.getLogger`` -- confirm in
# ``rag_engine.utils.logger``).
logger = get_logger(__name__)

# Matches a query whose very first word is a common English question word.
# The pattern is anchored at the start of the string (``^``), requires a
# word boundary after the starter (``\b``) so that e.g. "island" does not
# match "is", and ignores case.  ``QueryPreprocessor.preprocess`` uses it to
# decide whether a trailing "?" should be appended.
_QUESTION_STARTERS = re.compile(
    r"^(what|is|are|does|do|can|will|how|when|where|why)\b", re.IGNORECASE
)

# Keyword vocabularies consulted by ``QueryPreprocessor.extract_filters``.
# All entries are lower-case because they are compared against the
# lower-cased words of the query.
#
# The first three sets each select a value for the "coverage_category"
# filter ("flood", "fire" and "theft" respectively).
_FLOOD_KEYWORDS = {"flood", "water", "storm", "hurricane"}
_FIRE_KEYWORDS = {"fire", "smoke", "explosion", "burn"}
_THEFT_KEYWORDS = {"theft", "stolen", "burglary", "robbery"}
# The last two sets switch on the boolean filters "deductible_related" and
# "limit_related".
_DEDUCTIBLE_KEYWORDS = {"deductible", "excess", "out-of-pocket"}
_LIMIT_KEYWORDS = {"limit", "maximum", "cap", "ceiling"}


class QueryPreprocessor:
    """Clean and enrich user queries for retrieval.

    The class keeps no instance state: ``preprocess`` tidies the raw query
    text and ``extract_filters`` derives metadata filters from the keywords
    it contains.  Both methods log their result at INFO level.
    """

    # One "word" for keyword matching: a run of word characters, optionally
    # joined by single hyphens so that compounds such as "out-of-pocket"
    # stay in one piece.  Surrounding punctuation ("fire?", "(flood)",
    # "theft,") is therefore not part of the token.
    _WORD_TOKEN = re.compile(r"\w+(?:-\w+)*")

    # ------------------------------------------------------------------ #
    #  preprocess
    # ------------------------------------------------------------------ #
    def preprocess(self, query: str) -> str:
        """Normalise whitespace and append ``?`` to question-like queries.

        Args:
            query: Raw query text as typed by the user.

        Returns:
            The query with leading/trailing whitespace removed and every
            internal whitespace run collapsed to a single space.  If the
            stripped query starts with a question word (see
            ``_QUESTION_STARTERS``) and does not already end with ``?``,
            its first letter is upper-cased and ``?`` is appended.  Queries
            that already end with ``?`` are not re-capitalised.
        """
        original = query
        result = query.strip()

        # A successful match guarantees ``result`` is non-empty, so indexing
        # the first character below is safe without a separate guard.
        if not result.endswith("?") and _QUESTION_STARTERS.match(result):
            result = result[0].upper() + result[1:] + "?"

        # Collapse every whitespace run (spaces, tabs, newlines) to one space.
        result = re.sub(r"\s+", " ", result)

        logger.info("Query preprocessed: '%s' → '%s'", original, result)
        return result

    # ------------------------------------------------------------------ #
    #  extract_filters
    # ------------------------------------------------------------------ #
    def extract_filters(self, query: str, policy_id: str | None = None) -> dict:
        """Build a Supabase metadata filter dict from *query* keywords.

        Matching is case-insensitive and works on whole words; punctuation
        attached to a word (for example the ``?`` that ``preprocess``
        appends) does not prevent a match.

        Args:
            query: The user query, raw or already preprocessed.
            policy_id: Optional policy identifier; copied into the result
                under ``"policy_id"`` when it is not ``None``.

        Returns:
            A dict containing any of the following keys:

            * ``"policy_id"`` -- the given *policy_id*.
            * ``"coverage_category"`` -- ``"flood"``, ``"fire"`` or
              ``"theft"``.  When a query mentions several categories the
              last one in that order wins.
            * ``"deductible_related"`` -- ``True`` if a deductible keyword
              is present.
            * ``"limit_related"`` -- ``True`` if a limit keyword is present.
        """
        filters: dict = {}

        if policy_id is not None:
            filters["policy_id"] = policy_id

        # Tokenise with a regex rather than ``str.split`` so that trailing
        # or leading punctuation does not hide a keyword ("fire?" -> "fire").
        words = set(self._WORD_TOKEN.findall(query.lower()))

        # Evaluated in order; a later hit overwrites an earlier one, which
        # preserves the historical precedence theft > fire > flood.
        category_rules = (
            ("flood", _FLOOD_KEYWORDS),
            ("fire", _FIRE_KEYWORDS),
            ("theft", _THEFT_KEYWORDS),
        )
        for category, keywords in category_rules:
            if not words.isdisjoint(keywords):
                filters["coverage_category"] = category

        # Independent boolean flags; each is only ever set to True.
        flag_rules = (
            ("deductible_related", _DEDUCTIBLE_KEYWORDS),
            ("limit_related", _LIMIT_KEYWORDS),
        )
        for flag, keywords in flag_rules:
            if not words.isdisjoint(keywords):
                filters[flag] = True

        logger.info("Extracted filters: %s", filters)
        return filters