# File: policy-analysis/utils/query_constraints.py
# (page-scrape header: uploader "kaburia", commit "rewrite", b022bee)
import re
from typing import Dict, List, Optional, Union
# Finds "page <digits>" anywhere in a query, ignoring letter case; group 1 is
# the digit run.
PAGE_PATTERN = re.compile(
    r"page\s+(\d+)",
    re.IGNORECASE,
)

# Crude matcher for title-like phrases such as "Kenya Energy Policy 2018":
# one to seven capitalised ASCII words followed by a year of the form 20xx.
DOC_PHRASE_PATTERN = re.compile(
    r"([A-Z][A-Za-z]+(?:\s+[A-Z][A-Za-z]+){0,6}\s+20\d{2})"
)

# Lower-case filler words that _doc_tokens drops from a title phrase.
STOP = {
    "the",
    "and",
    "of",
    "in",
    "policy",
    "document",
    "national",
}
def _doc_tokens(phrase: str) -> List[str]:
    """Turn a document-title phrase into lower-cased matching tokens.

    Each run of ASCII letters/digits in ``phrase`` is lower-cased and kept
    unless it is listed in ``STOP`` or consists only of digits. Every ``20xx``
    year found in the phrase is then appended, in order of appearance.
    """
    words = []
    for raw in re.findall(r"[A-Za-z0-9]+", phrase):
        lowered = raw.lower()
        if lowered in STOP or raw.isdigit():
            continue
        words.append(lowered)
    years = re.findall(r"20\d{2}", phrase)
    return words + years
def parse_query_constraints(query: str) -> Dict[str, Union[Optional[int], List[str]]]:
    """Extract simple structured constraints from a natural-language query.

    Supported constraints:
      - page: "page 17" -> ``page=17`` (first match of ``PAGE_PATTERN``).
      - doc_tokens: tokens produced by ``_doc_tokens`` for the first
        title-like phrase matched by ``DOC_PHRASE_PATTERN`` that yields any
        tokens, e.g. "Kenya Energy Policy 2018".

    Args:
        query: The query text. A falsy value (``""`` or ``None``) yields no
            constraints.

    Returns:
        A dict with exactly two keys: ``"page"`` (an ``int`` or ``None``) and
        ``"doc_tokens"`` (a possibly empty list of strings).
    """
    page: Optional[int] = None
    doc_tokens: List[str] = []
    if not query:
        return {"page": page, "doc_tokens": doc_tokens}

    page_match = PAGE_PATTERN.search(query)
    if page_match:
        try:
            page = int(page_match.group(1))
        except ValueError:
            # int() can still reject an all-digit string, e.g. on interpreters
            # that cap the length of int/str conversions; treat as "no page".
            page = None

    for phrase_match in DOC_PHRASE_PATTERN.finditer(query):
        doc_tokens = _doc_tokens(phrase_match.group(1))
        if doc_tokens:
            break

    return {"page": page, "doc_tokens": doc_tokens}
def page_matches(meta, target_page: Optional[int]) -> bool:
    """Return True if the metadata's page label/number matches ``target_page``.

    The label is read from ``meta["page_label"]``, falling back to
    ``meta["page"]`` when the former is missing or falsy. Every run of digits
    in the label's string form is compared with ``target_page``, so a label
    such as "12-14" matches 12 and 14 but not 13.

    Args:
        meta: Mapping-like object exposing ``.get``.
        target_page: Requested page number, or ``None`` for "no constraint".

    Returns:
        ``True`` when there is no page constraint or when any number in the
        label equals ``target_page``; ``False`` when neither field holds a
        value or no number matches.
    """
    if target_page is None:
        return True
    # Deliberately no trailing `or ""`: that would turn a legitimate page
    # number 0 into "no label" and make page 0 unmatchable.
    label = meta.get("page_label") or meta.get("page")
    if label is None:
        return False
    # Normalise to text so ints and string labels are handled the same way.
    numbers = re.findall(r"\d+", str(label))
    return any(int(num) == target_page for num in numbers)
def doc_matches(meta, tokens: List[str], min_ratio: float = 0.6) -> bool:
    """Return True if enough of ``tokens`` occur in the document's source path.

    The source text is ``meta["source"]``, falling back to ``meta["path"]``,
    lower-cased. Each token counts as a hit when it is a substring of that
    text; duplicated tokens are counted once per occurrence in ``tokens``.

    Args:
        meta: Mapping-like object exposing ``.get``.
        tokens: Lower-cased tokens to look for. An empty list means "no
            document constraint" and always matches.
        min_ratio: Minimum fraction of tokens that must be present
            (default 0.6, i.e. 60%).

    Returns:
        ``True`` when there are no tokens or the hit fraction reaches
        ``min_ratio``; ``False`` when the metadata has no source/path.
    """
    if not tokens:
        return True
    raw_source = meta.get("source") or meta.get("path") or ""
    # str() keeps non-string values (e.g. path objects) from crashing on
    # .lower().
    src = str(raw_source).lower()
    if not src:
        return False
    hits = sum(1 for token in tokens if token in src)
    # tokens is non-empty here, so the division is safe.
    return hits / len(tokens) >= min_ratio