kaburia committed on
Commit
b022bee
·
1 Parent(s): 4c5d178
utils/coherence_bbscore.py CHANGED
@@ -3,7 +3,7 @@ import math, re, unicodedata
3
  from typing import List, Dict, Any, Optional, Tuple
4
  import numpy as np
5
  import os, re, unicodedata, numpy as np
6
-
7
  try:
8
  from sentence_transformers import SentenceTransformer
9
  except Exception:
@@ -242,14 +242,12 @@ def coherence_assessment_std(
242
  }
243
 
244
  # Get the coherence report
245
- def coherence_report(embedder="BAAI/bge-m3",
246
  input_text=None,
247
  reranked_results=None,
248
  run_zero_shot=True):
249
  embedder = Embedder(embedder) if isinstance(embedder, str) else embedder
250
  if reranked_results is None:
251
- # Import here to avoid circular imports
252
- from utils.retrieve_n_rerank import retrieve_and_rerank
253
  reranked_results = retrieve_and_rerank(input_text)
254
  if not reranked_results:
255
  return []
 
3
  from typing import List, Dict, Any, Optional, Tuple
4
  import numpy as np
5
  import os, re, unicodedata, numpy as np
6
+ from utils.retrieve_n_rerank import retrieve_and_rerank
7
  try:
8
  from sentence_transformers import SentenceTransformer
9
  except Exception:
 
242
  }
243
 
244
  # Get the coherence report
245
+ def coherence_report(embedder="MoritzLaurer/deberta-v3-base-zeroshot-v2.0",
246
  input_text=None,
247
  reranked_results=None,
248
  run_zero_shot=True):
249
  embedder = Embedder(embedder) if isinstance(embedder, str) else embedder
250
  if reranked_results is None:
 
 
251
  reranked_results = retrieve_and_rerank(input_text)
252
  if not reranked_results:
253
  return []
utils/conversation_logging.py ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os, json, time, threading, logging
2
+ from datetime import datetime
3
+ from typing import List, Tuple
4
+
5
+ try:
6
+ import boto3
7
+ from botocore.exceptions import ClientError, NoCredentialsError
8
+ except Exception:
9
+ boto3 = None
10
+ ClientError = NoCredentialsError = Exception
11
+
12
# Path of the local JSONL transcript; override with CONVO_LOG_FILE.
LOG_FILE = os.environ.get("CONVO_LOG_FILE", "conversation_history.jsonl")
# Upload is on by default; any value other than "true" (case-insensitive) disables it.
UPLOAD_ENABLED = os.environ.get("SPACES_UPLOAD_CONVO", "true").lower() == "true"

# Object-storage credentials / target, all taken from the environment.
# Key, secret and bucket have no default and are None when unset.
SPACES_KEY = os.environ.get("SPACES_KEY")
SPACES_SECRET = os.environ.get("SPACES_SECRET")
SPACES_BUCKET = os.environ.get("SPACES_BUCKET")
SPACES_REGION = os.environ.get("SPACES_REGION", "ams3")

# Held while a record is appended to LOG_FILE.
_lock = threading.Lock()
21
+
22
def load_history(max_lines: int = 500) -> List[Tuple[str, str]]:
    """Return the most recent (user, assistant) pairs stored in LOG_FILE.

    Only the last ``max_lines`` lines of the file are inspected. Lines that
    are not valid JSON, are not JSON objects, or whose ``"role"`` is not
    ``"exchange"`` are skipped. A missing file, a non-positive ``max_lines``
    or a read error yields an empty list (read errors are logged).
    """
    from collections import deque

    # A slice like lines[-0:] would return the *whole* file, so reject
    # non-positive limits explicitly.
    if max_lines <= 0 or not os.path.exists(LOG_FILE):
        return []

    try:
        with open(LOG_FILE, "r", encoding="utf-8") as f:
            # A bounded deque keeps only the tail while streaming the file,
            # instead of materialising every line first.
            tail = deque(f, maxlen=max_lines)
    except Exception as e:
        logging.error(f"Failed to load history: {e}")
        return []

    pairs: List[Tuple[str, str]] = []
    for line in tail:
        try:
            obj = json.loads(line)
        except json.JSONDecodeError:
            continue
        # Valid JSON that is not an object (e.g. a list or number) has no
        # .get(); skip it rather than aborting the rest of the history.
        if isinstance(obj, dict) and obj.get("role") == "exchange":
            pairs.append((obj.get("user", ""), obj.get("assistant", "")))
    return pairs
38
+
39
def _write_line(obj: dict):
    """Append ``obj`` to LOG_FILE as a single JSON line (non-ASCII kept as-is)."""
    with open(LOG_FILE, mode="a", encoding="utf-8") as handle:
        serialized = json.dumps(obj, ensure_ascii=False)
        handle.write(f"{serialized}\n")
42
+
43
def _upload_file():
    """Upload LOG_FILE to the configured DigitalOcean Spaces bucket.

    Does nothing unless uploading is enabled, boto3 was importable and the
    key, secret and bucket are all set. The object name comes from
    SPACES_CONVO_OBJECT, defaulting to ``chat-logs/<log file basename>``.
    Failures are logged and never raised.
    """
    configured = (
        UPLOAD_ENABLED and boto3 and SPACES_KEY and SPACES_SECRET and SPACES_BUCKET
    )
    if not configured:
        return
    try:
        client = boto3.session.Session().client(
            's3',
            region_name=SPACES_REGION,
            endpoint_url=f"https://{SPACES_REGION}.digitaloceanspaces.com",
            aws_access_key_id=SPACES_KEY,
            aws_secret_access_key=SPACES_SECRET,
        )
        default_object = f"chat-logs/{os.path.basename(LOG_FILE)}"
        object_name = os.getenv("SPACES_CONVO_OBJECT", default_object)
        client.upload_file(LOG_FILE, SPACES_BUCKET, object_name)
    except (ClientError, NoCredentialsError) as e:
        logging.error(f"Spaces upload failed: {e}")
    except Exception as e:
        logging.error(f"Unexpected upload error: {e}")
61
+
62
def log_exchange(user_msg: str, assistant_msg: str, meta: dict = None):
    """Append one user/assistant exchange to the log and trigger an upload.

    Args:
        user_msg: Text sent by the user.
        assistant_msg: Text returned by the assistant.
        meta: Optional extra data stored under ``"meta"``; ``{}`` when omitted.

    The record is written while holding ``_lock``; the upload runs in a
    daemon thread so the caller is not blocked by network I/O.
    """
    # Local import: only the ``datetime`` class is imported at module level.
    from datetime import timezone

    ts = time.time()
    # datetime.utcfromtimestamp() is deprecated since Python 3.12. Convert
    # with an explicit UTC tzinfo, then drop it so isoformat() emits no
    # "+00:00" suffix and the stored "...Z" format stays unchanged.
    utc_moment = datetime.fromtimestamp(ts, tz=timezone.utc).replace(tzinfo=None)
    record = {
        "role": "exchange",
        "timestamp": utc_moment.isoformat() + "Z",
        "user": user_msg,
        "assistant": assistant_msg,
        "meta": meta or {},
    }
    with _lock:
        _write_line(record)
    # Upload in background thread to avoid blocking UI
    threading.Thread(target=_upload_file, daemon=True).start()
utils/hybrid_retrieval.py ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Hybrid retrieval (BM25 + dense) with deterministic filtering + page consolidation."""
2
+ from typing import List, Optional, Dict, Any, Tuple
3
+ from rank_bm25 import BM25Okapi
4
+ import numpy as np
5
+ from utils.encoding_input import encode_text
6
+ from utils.retrieve_n_rerank import rerank_cross_encoder
7
+
8
def TOK_SPLIT(t):
    """Split ``t`` on whitespace and lowercase every token."""
    # str.split() with no argument never yields empty or blank pieces,
    # so no further filtering is required.
    return [w.lower() for w in t.split()]
9
+
10
class HybridRetriever:
    """Combine dense (vector index) and BM25 lexical retrieval, then rerank.

    ``vectorstore`` must expose ``index`` (with a FAISS-style ``search``),
    ``docstore.search(id)`` and ``index_to_docstore_id``; documents need
    ``page_content`` and, optionally, a ``metadata`` mapping.
    """

    def __init__(self, vectorstore):
        self.vs = vectorstore
        # Materialise every stored document in index order so positions
        # returned by the vector index and by BM25 address the same list.
        id_map = self.vs.index_to_docstore_id
        self.docs = [self.vs.docstore.search(id_map[i]) for i in range(len(id_map))]
        corpus_tokens = [TOK_SPLIT(d.page_content) for d in self.docs]
        self.bm25 = BM25Okapi(corpus_tokens)

    def fetch(self, query: str, k_dense=30, k_bm25=30,
              filters: Optional[Dict[str, Any]] = None, rerank_top=12) -> List[Any]:
        """Return reranked documents for ``query``.

        Args:
            query: Natural-language query.
            k_dense: Number of neighbours requested from the vector index.
            k_bm25: Number of top BM25 hits to take.
            filters: Metadata equality filters (compared as strings); entries
                whose value is ``None`` are ignored.
            rerank_top: ``top_m`` passed to ``rerank_cross_encoder``.

        Returns:
            Documents in reranked order, or ``[]`` when nothing survives
            the filters.
        """
        filters = filters or {}
        q_emb = encode_text(query)

        # Dense search
        q = np.asarray(q_emb, dtype="float32").reshape(1, -1)
        _, hit_ids = self.vs.index.search(q, k_dense)
        n_docs = len(self.docs)
        # FAISS pads the result with -1 when fewer than k vectors exist.
        # Without the lower bound, -1 would silently pick self.docs[-1].
        dense_docs = [self.docs[i] for i in hit_ids[0] if 0 <= i < n_docs]

        # BM25
        bm_scores = self.bm25.get_scores(TOK_SPLIT(query))
        top_bm_idx = np.argsort(bm_scores)[::-1][:k_bm25]
        bm25_docs = [self.docs[i] for i in top_bm_idx]

        # Union, keeping the first document seen per (source, page_label, page)
        uniq = {}
        for d in dense_docs + bm25_docs:
            m = getattr(d, 'metadata', {})
            key = (m.get('source'), m.get('page_label'), m.get('page'))
            if key not in uniq:
                uniq[key] = d
        docs = list(uniq.values())

        # Apply filters
        def ok(d):
            m = getattr(d, 'metadata', {})
            return all(
                str(m.get(k)) == str(v)
                for k, v in filters.items()
                if v is not None
            )

        if filters:
            docs = [d for d in docs if ok(d)]
        if not docs:
            return []

        # Rerank; the reranker's (doc, score) pairs are reduced to docs only.
        reranked = rerank_cross_encoder(query, docs, top_m=rerank_top)
        return [d for d, _ in reranked]
51
+
52
def consolidate_page(docs: List[Any], target_page: Optional[str]) -> List[Any]:
    """Merge all chunks that belong to ``target_page`` into one document per source.

    Documents are grouped by ``(source, page_label or page)``. For each group
    whose page equals ``str(target_page)`` a single ``Document`` is built:
    chunk texts joined by newlines, metadata copied from the first chunk plus
    a ``merged_chunks`` count. When ``target_page`` is falsy, or no group
    matches, the input ``docs`` are returned unchanged.
    """
    if not target_page:
        return docs

    grouped: Dict[Tuple[str, str], List[Any]] = {}
    for doc in docs:
        md = getattr(doc, 'metadata', {})
        group_key = (md.get('source'), str(md.get('page_label') or md.get('page')))
        grouped.setdefault(group_key, []).append(doc)

    from langchain.schema import Document

    wanted = str(target_page)
    merged = []
    for (_source, page), chunks in grouped.items():
        if page == wanted:
            combined_meta = dict(chunks[0].metadata)
            combined_meta['merged_chunks'] = len(chunks)
            combined_text = "\n".join(chunk.page_content for chunk in chunks)
            merged.append(Document(page_content=combined_text, metadata=combined_meta))
    return merged or docs
utils/ingest_pdf.py ADDED
@@ -0,0 +1,139 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Ingestion pipeline to build a page‑level FAISS index with rich metadata.
2
+
3
+ Features:
4
+ - Per page extraction (page_index 0-based, page_label as shown in PDF)
5
+ - Optional OCR fallback for blank / low-text pages (scanned PDFs)
6
+ - Records include: doc_id, doc_title, page_index, page_label, text,
7
+ section_heading (heuristic), span_start/stop (page chars),
8
+ has_anchor flags for configured phrases.
9
+ - Outputs JSONL + builds FAISS vector store (sentence-transformers/all-MiniLM-L6-v2)
10
+
11
+ Note: This is a lightweight scaffold; tune heading detection + anchors as needed.
12
+ """
13
+ from __future__ import annotations
14
+ import os, re, json, uuid
15
+ from dataclasses import dataclass, asdict
16
+ from typing import List, Dict, Iterable
17
+ from pypdf import PdfReader
18
+ import pytesseract
19
+ from PIL import Image
20
+ from io import BytesIO
21
+ from langchain_community.embeddings import HuggingFaceEmbeddings
22
+ from langchain_community.vectorstores import FAISS
23
+ from langchain.schema import Document
24
+
25
# Phrases whose presence on a page is recorded (case-insensitively) in the
# page's has_anchors mapping.
ANCHOR_PHRASES = [
    "Specifically these objectives are",
]

# A line counts as a heading when it is either
#   * an upper-case letter followed by 3+ upper-case letters/spaces/hyphens, or
#   * a dotted number ("1.", "2.3.") followed by whitespace and some text.
HEADING_PATTERN = re.compile(r"^\s*(?:[A-Z][A-Z \-]{3,}|\d+\.[0-9.]*\s+.+)$")
30
+
31
@dataclass
class PageRecord:
    """Everything recorded about one PDF page during ingestion."""

    # Identifier and title shared by every page of the same document.
    doc_id: str
    doc_title: str
    # 0-based position of the page in the PDF.
    page_index: int
    # Page label as a string (the PDF's own label, or index + 1 as fallback).
    page_label: str
    # Extracted (or OCR'd) page text.
    text: str
    # Heuristically detected heading; "" when none was found.
    section_heading: str
    # Character span of ``text`` within the page.
    span_start: int
    span_stop: int
    # Anchor phrase -> whether it occurs in ``text``.
    has_anchors: Dict[str, bool]
    source: str  # original path
43
+
44
def _extract_page_label(reader, idx: int) -> str:
    """Return the label ``reader.page_labels[idx]``, or ``str(idx + 1)`` if that lookup fails."""
    try:
        labels = reader.page_labels
        label = labels[idx]
    except Exception:
        # No usable label information: fall back to the 1-based page number.
        label = str(idx + 1)
    return label
50
+
51
def _ocr_page(page) -> str:
    """OCR every image embedded in ``page`` and return the combined text.

    Best effort: an image that cannot be opened or recognised is skipped,
    and a failure while accessing or iterating ``page.images`` ends the scan
    but keeps the text collected so far. Failures are logged instead of
    being silently discarded, so a broken OCR setup is visible.

    Returns:
        Non-empty OCR results joined by newlines and stripped; ``""`` when
        nothing was recognised.
    """
    import logging  # local import: this module does not import logging at the top

    log = logging.getLogger(__name__)
    texts = []
    try:
        # The loop itself sits inside the try: the image collection may raise
        # while it is being iterated, not only when it is first accessed.
        for img_obj in page.images:
            try:
                im = Image.open(BytesIO(img_obj.data))
                txt = pytesseract.image_to_string(im)
                if txt.strip():
                    texts.append(txt)
            except Exception as exc:
                log.warning("OCR failed for an embedded image: %s", exc)
                continue
    except Exception as exc:
        log.warning("Could not enumerate page images for OCR: %s", exc)
    return "\n".join(texts).strip()
67
+
68
def _heading_from_text(text: str) -> str:
    """Return the first heading-like line among the first 8 non-blank lines.

    A line qualifies when it matches HEADING_PATTERN and has at most 16
    whitespace-separated words; the result is truncated to 120 characters.
    Returns ``""`` when no line qualifies.
    """
    stripped = (raw.strip() for raw in text.splitlines())
    leading = [line for line in stripped if line][:8]  # inspect first few lines
    for candidate in leading:
        if not HEADING_PATTERN.match(candidate):
            continue
        if len(candidate.split()) <= 16:
            return candidate[:120]
    return ""
74
+
75
def ingest_pdf(path: str, doc_id: str = None, doc_title: str = None) -> List[PageRecord]:
    """Read the PDF at ``path`` and return one PageRecord per page.

    Args:
        path: Location of the PDF; also stored as each record's ``source``.
        doc_id: Document identifier. Defaults to the first 12 hex characters
            of a UUID5 derived from ``path``.
        doc_title: Document title. Defaults to the file name without extension.

    Pages whose extracted text has fewer than 20 non-blank characters are
    OCR'd; the OCR text is used only when it is longer than the extracted text.
    """
    reader = PdfReader(path)
    if not doc_id:
        doc_id = uuid.uuid5(uuid.NAMESPACE_URL, path).hex[:12]
    if not doc_title:
        doc_title = os.path.splitext(os.path.basename(path))[0]

    records: List[PageRecord] = []
    for page_index, page in enumerate(reader.pages):
        try:
            text = page.extract_text() or ""
        except Exception:
            text = ""
        if len(text.strip()) < 20:  # fallback to OCR for likely scanned page
            ocr_text = _ocr_page(page)
            if len(ocr_text) > len(text):
                text = ocr_text

        label = _extract_page_label(reader, page_index)
        heading = _heading_from_text(text)
        lowered = text.lower()
        anchors = {phrase: (phrase.lower() in lowered) for phrase in ANCHOR_PHRASES}
        records.append(
            PageRecord(
                doc_id=doc_id,
                doc_title=doc_title,
                page_index=page_index,
                page_label=str(label),
                text=text,
                section_heading=heading,
                span_start=0,
                span_stop=len(text),
                has_anchors=anchors,
                source=path,
            )
        )
    return records
106
+
107
def build_vectorstore(records: List[PageRecord], index_dir: str = "faiss_index_new") -> str:
    """Embed ``records`` into a FAISS index saved under ``index_dir``.

    Each record becomes one Document whose metadata carries the record's
    fields (except the text) plus one ``anchor_<phrase>`` flag per anchor.
    The full records are also written to ``<index_dir>/pages.jsonl``.

    Returns:
        ``index_dir``.
    """
    os.makedirs(index_dir, exist_ok=True)
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

    docs = []
    for rec in records:
        metadata = {
            "doc_id": rec.doc_id,
            "doc_title": rec.doc_title,
            "page_index": rec.page_index,
            "page_label": rec.page_label,
            "section_heading": rec.section_heading,
            "span_start": rec.span_start,
            "span_stop": rec.span_stop,
            "source": rec.source,
        }
        for phrase, present in rec.has_anchors.items():
            metadata[f"anchor_{phrase}"] = present
        docs.append(Document(page_content=rec.text, metadata=metadata))

    store = FAISS.from_documents(docs, embeddings)
    store.save_local(index_dir)

    # also write JSONL
    jsonl_path = os.path.join(index_dir, "pages.jsonl")
    with open(jsonl_path, "w", encoding="utf-8") as out:
        for rec in records:
            out.write(json.dumps(asdict(rec), ensure_ascii=False) + "\n")
    return index_dir
128
+
129
if __name__ == "__main__":
    # Command-line entry point: ingest one PDF and build its index directory.
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("pdf", help="Path to PDF")
    parser.add_argument("--doc-id")
    parser.add_argument("--doc-title")
    parser.add_argument("--out", default="faiss_index_new")
    cli = parser.parse_args()

    pages = ingest_pdf(cli.pdf, doc_id=cli.doc_id, doc_title=cli.doc_title)
    build_vectorstore(pages, cli.out)
    print(f"Ingested {len(pages)} pages -> {cli.out}")
utils/model_generation.py CHANGED
@@ -26,37 +26,9 @@ PROMPT_TEMPLATES = {
26
  "The context is already searched, retrieved and reranked when handed to you."
27
 
28
 
29
- ),
30
- "user_template": """
31
- Query: {query}
32
-
33
- Deliverables (use the exact section headers below; omit any section whose input is empty/disabled):
34
- 1) Quoted Policy Excerpts
35
- - Quote the necessary text and append citations like (filename p.X). Group by subtopic.
36
- - Try to meet the user's specification as much as possible where if they only want items from a certain page only give out data from that page or if it is from a certain document please only retrieve just from that document
37
- - Order by page
38
- 2) Sentiment Summary
39
- - Using the Sentiment JSON, explain tone, gaps, penalties, and enforcement clarity in plain English. Do not invent fields that aren't present.
40
- 3) Coherence Assessment
41
- - From the coherence report only provide when ticked: state on-topic vs off-topic; call out which sections were coherent, off-topic, or repeated.
42
-
43
-
44
- Constraints:
45
- - No external knowledge. No speculation. If a user ask is outside the sources, state 'Not found in sources.'
46
- - Use full sentences (no telegraphic fragments).
47
- - Each substantive statement has a citation.
48
-
49
- Topic hint: {topic_hint}
50
-
51
- Sentiment JSON (rolled-up across top docs):
52
- {sentiment_json}
53
-
54
- Coherence report:
55
- {coherence_report}
56
-
57
- Context Sources:
58
- {context_block}
59
- """
60
  },
61
 
62
  "abstractive_summary": {
@@ -183,7 +155,7 @@ def build_context_block(top_docs: List[Dict[str, Any]]) -> str:
183
 
184
  citation = f"{filename}, p. {page_label}"
185
 
186
- blocks.append(f"<<<SOURCE: {citation}>>>\n{_clip(text)}\n</SOURCE>")
187
 
188
  return "\n".join(blocks)
189
 
@@ -195,25 +167,47 @@ def build_messages(
195
  task_mode: str,
196
  sentiment_rollup: Dict[str, List[str]],
197
  coherence_report: str = "",
198
- topic_hint: str = "energy policy"
 
199
  ) -> List[Dict[str, str]]:
200
  template = PROMPT_TEMPLATES.get(task_mode)
201
  if not template:
202
  raise ValueError(f"Unknown task mode: {task_mode}")
203
 
204
  context_block = build_context_block(top_docs)
 
 
205
  sentiment_json = json.dumps(sentiment_rollup or {}, ensure_ascii=False)
206
 
207
- user_prompt = template["user_template"].format(
208
- query=query,
209
- topic_hint=topic_hint,
210
- sentiment_json=sentiment_json,
211
- context_block=context_block,
212
- coherence_report=coherence_report
 
 
 
 
 
 
 
213
  )
 
 
 
 
 
 
 
 
 
 
 
 
214
 
215
  return [
216
- {"role": "system", "content": template["system"]},
217
  {"role": "user", "content": user_prompt}
218
  ]
219
 
 
26
  "The context is already searched, retrieved and reranked when handed to you."
27
 
28
 
29
+ ),
30
+ # dynamic assembly; placeholders kept for backward compatibility but sections may be removed
31
+ "user_template": "DYNAMIC"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32
  },
33
 
34
  "abstractive_summary": {
 
155
 
156
  citation = f"{filename}, p. {page_label}"
157
 
158
+ blocks.append(f"<<<SOURCE: {citation}>>>\n{_clip(text)}\n</SOURCE>")
159
 
160
  return "\n".join(blocks)
161
 
 
167
  task_mode: str,
168
  sentiment_rollup: Dict[str, List[str]],
169
  coherence_report: str = "",
170
+ topic_hint: str = "energy policy",
171
+ allowlist_meta: Dict[str, Any] = None
172
  ) -> List[Dict[str, str]]:
173
  template = PROMPT_TEMPLATES.get(task_mode)
174
  if not template:
175
  raise ValueError(f"Unknown task mode: {task_mode}")
176
 
177
  context_block = build_context_block(top_docs)
178
+ sentiment_present = bool(sentiment_rollup)
179
+ coherence_present = bool(coherence_report)
180
  sentiment_json = json.dumps(sentiment_rollup or {}, ensure_ascii=False)
181
 
182
+ # Build user prompt dynamically to truly omit absent sections
183
+ parts = [
184
+ f"Query: {query}\n",
185
+ "Deliverables (omit any section whose input is empty/disabled):",
186
+ "1) Quoted Policy Excerpts\n - Quote the necessary text and append citations like (filename p.X). Group by subtopic.\n - Honor any page or document restriction from the query strictly.\n - Order by page",
187
+ ]
188
+ if sentiment_present:
189
+ parts.append("2) Sentiment Summary\n - Using the Sentiment JSON, explain tone, gaps, penalties, and enforcement clarity in plain English. Do not invent fields that aren't present.")
190
+ if coherence_present:
191
+ idx = 3 if sentiment_present else 2
192
+ parts.append(f"{idx}) Coherence Assessment\n - From the coherence report: on-topic vs off-topic; note coherent/off-topic/repeated sections only if present.")
193
+ parts.append(
194
+ "\nConstraints:\n- No external knowledge. No speculation. If a user ask is outside the sources, state 'Not found in sources.'\n- Use full sentences.\n- Each substantive statement has a citation."
195
  )
196
+ parts.append(f"\nTopic hint: {topic_hint}\n")
197
+ if sentiment_present:
198
+ parts.append(f"Sentiment JSON (rolled-up across top docs):\n{sentiment_json}\n")
199
+ if coherence_present:
200
+ parts.append(f"Coherence report:\n{coherence_report}\n")
201
+ guard = ""
202
+ if allowlist_meta:
203
+ doc_id = allowlist_meta.get('doc_id')
204
+ pages = allowlist_meta.get('pages')
205
+ guard = f"[ALLOWLIST_DOCS] doc_id={doc_id}; pages={pages}\nOnly use text from chunks where doc_id={doc_id} and page_label in {pages}. If none present reply exactly: Not found in sources for page {pages} of {doc_id}. Do not use any other documents.\n"
206
+ parts.append(f"{guard}Context Sources:\n{context_block}")
207
+ user_prompt = "\n".join(parts)
208
 
209
  return [
210
+ {"role": "system", "content": template["system"]},
211
  {"role": "user", "content": user_prompt}
212
  ]
213
 
utils/query_constraints.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ from typing import Optional, Dict, List
3
+
4
# "page 17" / "Page  3". The leading \b stops the pattern from firing inside
# longer words such as "homepage 12".
PAGE_PATTERN = re.compile(r"\bpage\s+(\d+)", re.IGNORECASE)
# Crude pattern for document names such as "Kenya Energy Policy 2018":
# one to seven capitalised words followed by a year in the range 2000-2099.
DOC_PHRASE_PATTERN = re.compile(r"([A-Z][A-Za-z]+(?:\s+[A-Z][A-Za-z]+){0,6}\s+20\d{2})")
# Words too generic to help identify a document.
STOP = {"the", "and", "of", "in", "policy", "document", "national"}
8
+
9
def _doc_tokens(phrase: str) -> List[str]:
    """Turn a document phrase into lowercase match tokens.

    Alphanumeric words are lowercased; stop words and purely numeric words
    are dropped. Every ``20xx`` year found in the phrase is then appended.
    """
    words = []
    for raw in re.findall(r"[A-Za-z0-9]+", phrase):
        lowered = raw.lower()
        if lowered in STOP or raw.isdigit():
            continue
        words.append(lowered)
    years = re.findall(r"20\d{2}", phrase)
    return words + years
12
+
13
def parse_query_constraints(query: str) -> Dict[str, object]:
    """Extract simple structured constraints from the natural language query.

    Returns a dict with two keys:
    - ``"page"``: the integer from the first PAGE_PATTERN match
      (e.g. "page 17" -> 17), or ``None`` when no page is mentioned.
    - ``"doc_tokens"``: tokens of the first DOC_PHRASE_PATTERN match that
      yields any tokens, or ``[]``.

    An empty or ``None`` query yields ``{"page": None, "doc_tokens": []}``.
    """
    page: Optional[int] = None
    doc_tokens: List[str] = []
    if not query:
        return {"page": page, "doc_tokens": doc_tokens}

    page_match = PAGE_PATTERN.search(query)
    if page_match:
        try:
            page = int(page_match.group(1))
        except ValueError:
            # Kept as a guard: int() can still reject a digit run, e.g. one
            # longer than the interpreter's integer string length limit.
            page = None

    for phrase_match in DOC_PHRASE_PATTERN.finditer(query):
        doc_tokens = _doc_tokens(phrase_match.group(1))
        if doc_tokens:
            break
    return {"page": page, "doc_tokens": doc_tokens}
35
+
36
def page_matches(meta, target_page: Optional[int]) -> bool:
    """Return True if metadata page/page_label matches the requested page.

    ``page_label`` is preferred over ``page``; a missing or empty value
    falls through to the next key. The chosen value is converted to a string
    and every integer in it is compared with ``target_page``, so labels such
    as "iv, 4" or "10-12" match any number they literally contain.

    Args:
        meta: Mapping holding ``page_label`` and/or ``page``.
        target_page: Requested page; ``None`` means "no constraint".

    Returns:
        True when there is no constraint or a number in the label equals
        ``target_page``; False when no label is available or nothing matches.
    """
    if target_page is None:
        return True

    label = None
    for key in ("page_label", "page"):
        value = meta.get(key)
        # Plain truthiness would throw away a legitimate page number 0,
        # so 0 is accepted explicitly while None / "" still fall through.
        if value or value == 0:
            label = value
            break
    if label is None:
        return False

    # Normalize to string and extract integers present
    numbers = re.findall(r"\d+", str(label))
    return any(int(n) == target_page for n in numbers)
49
+
50
def doc_matches(meta, tokens: List[str]) -> bool:
    """Return True when enough ``tokens`` occur in the document's path.

    The path is ``meta["source"]`` (or ``meta["path"]``), lowercased, and
    tokens are matched as substrings. No tokens means no constraint (True);
    a missing path never matches (False); otherwise at least 60% of the
    tokens must be present.
    """
    if not tokens:
        return True
    location = (meta.get("source") or meta.get("path") or "").lower()
    if not location:
        return False
    found = [token for token in tokens if token in location]
    # require at least 60% of tokens present
    return len(found) / max(1, len(tokens)) >= 0.6