"""Extract page/document constraints from a query and match them against metadata."""

import re
from typing import Any, Dict, List, Mapping, Optional

# Matches "page 17", "Page   3", ... and captures the number.
PAGE_PATTERN = re.compile(r"page\s+(\d+)", re.IGNORECASE)
# Crude pattern capturing phrases like "Kenya Energy Policy 2018": one to seven
# capitalised words followed by a year of the form 20xx.
DOC_PHRASE_PATTERN = re.compile(r"([A-Z][A-Za-z]+(?:\s+[A-Z][A-Za-z]+){0,6}\s+20\d{2})")
# Words dropped from a document phrase before it is used for matching.
STOP = {"the", "and", "of", "in", "policy", "document", "national"}


def _doc_tokens(phrase: str) -> List[str]:
    """Return lower-cased matching tokens for a document phrase.

    Alphanumeric words are lower-cased; stop words and purely numeric words
    are dropped. Every ``20xx`` year found in the phrase is then appended.
    """
    words = [
        word
        for word in (raw.lower() for raw in re.findall(r"[A-Za-z0-9]+", phrase))
        if word not in STOP and not word.isdigit()
    ]
    years = re.findall(r"20\d{2}", phrase)
    return words + years


def parse_query_constraints(query: str) -> Dict[str, Any]:
    """Extract simple structured constraints from a natural language query.

    Supported constraints:

    - page: ``"page 17"`` -> ``page=17``
    - document: the first phrase such as ``"Kenya Energy Policy 2018"`` is
      reduced to tokens, e.g. ``["kenya", "energy", "2018"]``

    Args:
        query: The user's query. An empty or ``None`` query yields no
            constraints.

    Returns:
        A dict with two keys: ``"page"`` (``int`` or ``None``) and
        ``"doc_tokens"`` (a possibly empty ``list`` of ``str``).
    """
    page: Optional[int] = None
    doc_tokens: List[str] = []
    if not query:
        return {"page": page, "doc_tokens": doc_tokens}

    page_match = PAGE_PATTERN.search(query)
    if page_match:
        try:
            page = int(page_match.group(1))
        except ValueError:
            # Defensive: leave the page unconstrained if the digits cannot be parsed.
            page = None

    # Use the first phrase that yields at least one token.
    for phrase_match in DOC_PHRASE_PATTERN.finditer(query):
        doc_tokens = _doc_tokens(phrase_match.group(1))
        if doc_tokens:
            break

    return {"page": page, "doc_tokens": doc_tokens}


def page_matches(meta: Mapping[str, Any], target_page: Optional[int]) -> bool:
    """Return True if the metadata's page label or page matches ``target_page``.

    ``meta["page_label"]`` is preferred, falling back to ``meta["page"]`` when
    the label is missing or empty. The chosen value may be numeric or a
    string; if it contains several numbers (a range or list such as
    ``"12-14"``), any number equal to ``target_page`` counts as a match.

    Args:
        meta: Metadata mapping, read via ``.get("page_label")`` / ``.get("page")``.
        target_page: Requested page, or ``None`` for "no page constraint".

    Returns:
        ``True`` when there is no constraint or the page matches, else ``False``.
    """
    if target_page is None:
        return True

    # Pick the first field that actually carries a value. A truthiness test
    # would discard a legitimate page number of 0, so compare explicitly.
    label = None
    for key in ("page_label", "page"):
        candidate = meta.get(key)
        if candidate is not None and candidate != "":
            label = candidate
            break
    if label is None:
        return False

    # Normalise to a string and compare every integer it contains.
    return any(int(num) == target_page for num in re.findall(r"\d+", str(label)))


def doc_matches(meta: Mapping[str, Any], tokens: List[str], *, min_ratio: float = 0.6) -> bool:
    """Return True if enough ``tokens`` occur in the metadata's source/path.

    Matching is a case-insensitive substring test of each token against
    ``meta["source"]`` (or ``meta["path"]`` when the source is missing or empty).

    Args:
        meta: Metadata mapping, read via ``.get("source")`` / ``.get("path")``.
        tokens: Lower-cased tokens to look for. An empty list means "no
            document constraint" and always matches.
        min_ratio: Minimum fraction of tokens that must be present
            (default 0.6, i.e. at least 60%).

    Returns:
        ``True`` when there is no constraint or the hit ratio reaches
        ``min_ratio``; ``False`` otherwise, including when the metadata has
        no source/path.
    """
    if not tokens:
        return True

    # str() so a non-string source value does not fail on .lower().
    source = str(meta.get("source") or meta.get("path") or "").lower()
    if not source:
        return False

    hits = sum(1 for token in tokens if token in source)
    return hits / max(1, len(tokens)) >= min_ratio