Spaces:
Running
Running
| import re | |
| from typing import Optional, Dict, List | |
# Matches an explicit page reference such as "page 17" (case-insensitive).
# The leading \b keeps words that merely end in "page" (e.g. "homepage 5")
# from being read as a page constraint.
PAGE_PATTERN = re.compile(r"\bpage\s+(\d+)", re.IGNORECASE)

# Crude pattern for document-title phrases such as "Kenya Energy Policy 2018":
# one to seven capitalised words followed by a year in the range 2000-2099.
# The trailing (?!\d) rejects longer numbers that merely start with 20xx
# (e.g. "Order 20185"); group 1 is still the phrase including the year.
DOC_PHRASE_PATTERN = re.compile(
    r"([A-Z][A-Za-z]+(?:\s+[A-Z][A-Za-z]+){0,6}\s+20\d{2})(?!\d)"
)

# Low-information words dropped when a phrase is turned into match tokens.
STOP = {"the", "and", "of", "in", "policy", "document", "national"}
def _doc_tokens(phrase: str) -> List[str]:
    """Turn a document phrase into a list of match tokens.

    Alphanumeric words are lowercased and kept unless they are in STOP or
    consist only of digits. Every 20xx year found in the phrase is then
    appended after the words.
    """
    words: List[str] = []
    for raw in re.findall(r"[A-Za-z0-9]+", phrase):
        lowered = raw.lower()
        if lowered in STOP or raw.isdigit():
            continue
        words.append(lowered)
    # Pure-digit words were skipped above; years are re-added explicitly here.
    years = re.findall(r"20\d{2}", phrase)
    return words + years
def parse_query_constraints(query: str) -> Dict[str, object]:
    """Extract simple structured constraints from a natural-language query.

    Two constraints are recognised:

    - page: the first "page <number>" mention, e.g. "page 17" -> 17.
    - doc_tokens: match tokens for the first document-title phrase found by
      DOC_PHRASE_PATTERN that yields any tokens, as produced by _doc_tokens.

    Args:
        query: The raw user query. An empty (or None) query yields no
            constraints.

    Returns:
        A dict with exactly two keys: "page" (an int, or None when no page
        is mentioned) and "doc_tokens" (a list of str, empty when no
        document phrase is found).
    """
    page: Optional[int] = None
    doc_tokens: List[str] = []
    if not query:
        return {"page": page, "doc_tokens": doc_tokens}

    page_match = PAGE_PATTERN.search(query)
    if page_match:
        try:
            page = int(page_match.group(1))
        except ValueError:
            # Defensive: digits that int() rejects mean "no page constraint".
            page = None

    # Use the first phrase that produces at least one token.
    for phrase_match in DOC_PHRASE_PATTERN.finditer(query):
        doc_tokens = _doc_tokens(phrase_match.group(1))
        if doc_tokens:
            break

    return {"page": page, "doc_tokens": doc_tokens}
def page_matches(meta, target_page: Optional[int]) -> bool:
    """Return True if the metadata's page label matches the requested page.

    The label is read from meta["page_label"], falling back to meta["page"]
    when the label is missing or an empty string. Numeric and string labels
    are both accepted: the label is converted to text and every run of
    digits in it is compared with target_page, so a label such as "12-14"
    matches 12 and 14 (the numbers written in it) but not 13.

    Args:
        meta: Mapping exposing .get(); expected to carry "page_label"
            and/or "page".
        target_page: Requested page number, or None for "no page filter".

    Returns:
        True when target_page is None or equals any number in the label;
        False when no label is available or no number matches.
    """
    if target_page is None:
        return True

    label = meta.get("page_label")
    # Compare against None / "" explicitly instead of using truthiness:
    # a numeric page 0 is a valid label and must not be treated as missing.
    if label is None or label == "":
        label = meta.get("page")
    if label is None:
        return False

    numbers = re.findall(r"\d+", str(label))
    return any(int(number) == target_page for number in numbers)
def doc_matches(meta, tokens: List[str]) -> bool:
    """Return True if enough of the tokens occur in the document's source.

    The source string is meta["source"], falling back to meta["path"], and
    is lowercased before comparison. Each token counts as found when it is
    a substring of that string. An empty token list matches everything; a
    missing or empty source matches nothing.
    """
    if not tokens:
        return True

    haystack = (meta.get("source") or meta.get("path") or "").lower()
    if not haystack:
        return False

    found = [token for token in tokens if token in haystack]
    # At least 60% of the tokens must be present.
    return len(found) / max(1, len(tokens)) >= 0.6