File size: 2,182 Bytes
b022bee
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
import re
from typing import Optional, Dict, List

# Matches a page reference such as "page 17" (case-insensitive) and captures the digits.
PAGE_PATTERN = re.compile(r"page\s+(\d+)", re.IGNORECASE)
# Crude pattern capturing title-like phrases such as "Kenya Energy Policy 2018":
# one to seven capitalized alphabetic words (two or more letters each) followed
# by whitespace and a year of the form 20xx. Lowercase words break the phrase.
DOC_PHRASE_PATTERN = re.compile(r"([A-Z][A-Za-z]+(?:\s+[A-Z][A-Za-z]+){0,6}\s+20\d{2})")
# Lowercase words that _doc_tokens drops because they carry no identifying signal.
STOP = {"the","and","of","in","policy","document","national"}

def _doc_tokens(phrase: str) -> List[str]:
    """Turn a document phrase into lowercase keyword tokens followed by its years.

    Alphanumeric runs are lowercased; runs that are stop words (see STOP) or
    consist only of digits are dropped. Every ``20xx`` substring found in the
    phrase is then appended, in order of appearance, after the word tokens.
    """
    words: List[str] = []
    for raw in re.findall(r"[A-Za-z0-9]+", phrase):
        lowered = raw.lower()
        if lowered in STOP or raw.isdigit():
            continue
        words.append(lowered)
    years = re.findall(r"20\d{2}", phrase)
    return words + years

def parse_query_constraints(query: str) -> Dict[str, object]:
    """Extract simple structured constraints from a natural-language query.

    Supported constraints:
      - page: "page 17" -> page=17, taken from the first PAGE_PATTERN match.
      - doc_tokens: keyword tokens derived from the first DOC_PHRASE_PATTERN
        match (e.g. "Kenya Energy Policy 2018") that yields any tokens.

    Args:
        query: The user's query text. An empty or None query yields no constraints.

    Returns:
        A dict with exactly two keys: "page" (an int, or None when no page was
        requested) and "doc_tokens" (a list of str, empty when no document
        phrase was found).
    """
    page: Optional[int] = None
    doc_tokens: List[str] = []
    if not query:
        return {"page": page, "doc_tokens": doc_tokens}

    page_match = PAGE_PATTERN.search(query)
    if page_match:
        try:
            page = int(page_match.group(1))
        except ValueError:
            # Defensive: if the digit run cannot be converted, behave as if
            # no page was requested rather than failing the whole query.
            page = None

    # Use the first phrase that produces tokens; later phrases are ignored.
    for phrase_match in DOC_PHRASE_PATTERN.finditer(query):
        doc_tokens = _doc_tokens(phrase_match.group(1))
        if doc_tokens:
            break

    return {"page": page, "doc_tokens": doc_tokens}

def page_matches(meta, target_page: Optional[int]) -> bool:
    """Return True if the metadata's page label or page number matches target_page.

    The value of ``meta["page_label"]`` is used when present, otherwise
    ``meta["page"]``. Empty or missing values are skipped, but a numeric ``0``
    is a real page number (zero-indexed metadata) and is not treated as missing.
    The chosen value is converted to a string and every run of digits in it is
    compared with ``target_page``, so a label like "12-14" or "3, 7" matches
    any of the numbers written in it (range endpoints only, not the pages
    in between).

    Args:
        meta: Mapping-like object supporting ``.get`` with optional
            "page_label" and "page" entries.
        target_page: The requested page, or None for "no page constraint".

    Returns:
        True when there is no constraint or when one of the numbers in the
        selected label equals target_page; False otherwise.
    """
    if target_page is None:
        return True

    label = None
    for key in ("page_label", "page"):
        value = meta.get(key)
        # A plain truthiness test would discard page 0, so accept numeric
        # zero explicitly (bool is excluded: False is not a page number).
        is_numeric_zero = (
            isinstance(value, (int, float))
            and not isinstance(value, bool)
            and value == 0
        )
        if value or is_numeric_zero:
            label = value
            break
    if label is None:
        return False

    # Normalize to string and compare every integer that appears in it.
    return any(int(n) == target_page for n in re.findall(r"\d+", str(label)))

def doc_matches(meta, tokens: List[str]) -> bool:
    """Return True if enough of the document tokens occur in the metadata source.

    The lowercased ``meta["source"]`` (or ``meta["path"]`` when source is
    empty or missing) is searched for each token as a plain substring. With no
    tokens there is no constraint and the result is True; with tokens but no
    source/path the result is False. Otherwise at least 60% of the tokens
    must be found.
    """
    if not tokens:
        return True

    haystack = (meta.get("source") or meta.get("path") or "").lower()
    if not haystack:
        return False

    found = 0
    for token in tokens:
        if token in haystack:
            found += 1

    # Accept when the share of matched tokens reaches the 60% threshold.
    return found / max(1, len(tokens)) >= 0.6