"""Ingestion pipeline to build a page‑level FAISS index with rich metadata. Features: - Per page extraction (page_index 0-based, page_label as shown in PDF) - Optional OCR fallback for blank / low-text pages (scanned PDFs) - Records include: doc_id, doc_title, page_index, page_label, text, section_heading (heuristic), span_start/stop (page chars), has_anchor flags for configured phrases. - Outputs JSONL + builds FAISS vector store (sentence-transformers/all-MiniLM-L6-v2) Note: This is a lightweight scaffold; tune heading detection + anchors as needed. """ from __future__ import annotations import os, re, json, uuid from dataclasses import dataclass, asdict from typing import List, Dict, Iterable from pypdf import PdfReader import pytesseract from PIL import Image from io import BytesIO from langchain_community.embeddings import HuggingFaceEmbeddings from langchain_community.vectorstores import FAISS from langchain.schema import Document ANCHOR_PHRASES = [ "Specifically these objectives are", ] HEADING_PATTERN = re.compile(r"^\s*(?:[A-Z][A-Z \-]{3,}|\d+\.[0-9.]*\s+.+)$") @dataclass class PageRecord: doc_id: str doc_title: str page_index: int page_label: str text: str section_heading: str span_start: int span_stop: int has_anchors: Dict[str, bool] source: str # original path def _extract_page_label(reader, idx: int) -> str: # Attempt to read logical page label from PDF (if present); fallback to idx+1 try: return reader.page_labels[idx] except Exception: return str(idx + 1) def _ocr_page(page) -> str: try: images = page.images except Exception: images = [] texts = [] for img_obj in images: try: data = img_obj.data im = Image.open(BytesIO(data)) txt = pytesseract.image_to_string(im) if txt.strip(): texts.append(txt) except Exception: continue return "\n".join(texts).strip() def _heading_from_text(text: str) -> str: lines = [l.strip() for l in text.splitlines() if l.strip()] for l in lines[:8]: # inspect first few lines if HEADING_PATTERN.match(l) and len(l.split()) <= 16: return l[:120] return "" def ingest_pdf(path: str, doc_id: str = None, doc_title: str = None) -> List[PageRecord]: reader = PdfReader(path) doc_id = doc_id or uuid.uuid5(uuid.NAMESPACE_URL, path).hex[:12] doc_title = doc_title or os.path.splitext(os.path.basename(path))[0] records: List[PageRecord] = [] for i, page in enumerate(reader.pages): try: raw = page.extract_text() or "" except Exception: raw = "" if len(raw.strip()) < 20: # fallback to OCR for likely scanned page raw_ocr = _ocr_page(page) if len(raw_ocr) > len(raw): raw = raw_ocr page_label = _extract_page_label(reader, i) heading = _heading_from_text(raw) has_anchors = {a: (a.lower() in raw.lower()) for a in ANCHOR_PHRASES} rec = PageRecord( doc_id=doc_id, doc_title=doc_title, page_index=i, page_label=str(page_label), text=raw, section_heading=heading, span_start=0, span_stop=len(raw), has_anchors=has_anchors, source=path, ) records.append(rec) return records def build_vectorstore(records: List[PageRecord], index_dir: str = "faiss_index_new") -> str: os.makedirs(index_dir, exist_ok=True) embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2") docs = [Document(page_content=r.text, metadata={ "doc_id": r.doc_id, "doc_title": r.doc_title, "page_index": r.page_index, "page_label": r.page_label, "section_heading": r.section_heading, "span_start": r.span_start, "span_stop": r.span_stop, "source": r.source, **{f"anchor_{k}": v for k, v in r.has_anchors.items()} }) for r in records] vs = FAISS.from_documents(docs, embeddings) vs.save_local(index_dir) # also write JSONL with open(os.path.join(index_dir, "pages.jsonl"), "w", encoding="utf-8") as f: for r in records: f.write(json.dumps(asdict(r), ensure_ascii=False) + "\n") return index_dir if __name__ == "__main__": import argparse ap = argparse.ArgumentParser() ap.add_argument("pdf", help="Path to PDF") ap.add_argument("--doc-id") ap.add_argument("--doc-title") ap.add_argument("--out", default="faiss_index_new") args = ap.parse_args() recs = ingest_pdf(args.pdf, doc_id=args.doc_id, doc_title=args.doc_title) build_vectorstore(recs, args.out) print(f"Ingested {len(recs)} pages -> {args.out}")