# policy-analysis / utils / ingest_pdf.py
# (Hugging Face Hub listing residue: uploader "kaburia", commit message "rewrite", commit b022bee)
"""Ingestion pipeline to build a page‑level FAISS index with rich metadata.
Features:
- Per page extraction (page_index 0-based, page_label as shown in PDF)
- Optional OCR fallback for blank / low-text pages (scanned PDFs)
- Records include: doc_id, doc_title, page_index, page_label, text,
section_heading (heuristic), span_start/stop (page chars),
has_anchor flags for configured phrases.
- Outputs JSONL + builds FAISS vector store (sentence-transformers/all-MiniLM-L6-v2)
Note: This is a lightweight scaffold; tune heading detection + anchors as needed.
"""
from __future__ import annotations
import os, re, json, uuid
from dataclasses import dataclass, asdict
from typing import List, Dict, Iterable
from pypdf import PdfReader
import pytesseract
from PIL import Image
from io import BytesIO
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.schema import Document
# Phrases whose presence on a page is recorded as a per-page boolean flag
# (see PageRecord.has_anchors); matching is case-insensitive in ingest_pdf.
ANCHOR_PHRASES = [
    "Specifically these objectives are",
]
# Heading heuristic: an ALL-CAPS line (capitals/spaces/hyphens, >= 4 chars)
# OR a numbered heading such as "2.1 Some title".
HEADING_PATTERN = re.compile(r"^\s*(?:[A-Z][A-Z \-]{3,}|\d+\.[0-9.]*\s+.+)$")
@dataclass
class PageRecord:
    """One extracted PDF page plus the metadata stored alongside its text."""
    doc_id: str            # stable short id (uuid5 of the path unless supplied)
    doc_title: str         # human-readable title (filename stem by default)
    page_index: int        # 0-based physical page position
    page_label: str        # logical label as shown in the PDF (e.g. "iv", "12")
    text: str              # extracted (or OCR-fallback) page text
    section_heading: str   # heuristic heading from the page's first lines, "" if none
    span_start: int        # char offset of text start within the page (always 0 here)
    span_stop: int         # char offset of text end (== len(text))
    has_anchors: Dict[str, bool]  # anchor phrase -> present on this page?
    source: str  # original path
def _extract_page_label(reader, idx: int) -> str:
    """Return the PDF's logical label for page *idx*, falling back to str(idx + 1).

    pypdf may not expose page labels at all (or *idx* may be out of range),
    so any failure yields the 1-based physical page number as a string.
    """
    fallback = str(idx + 1)
    try:
        label = reader.page_labels[idx]
    except Exception:
        return fallback
    return label
def _ocr_page(page) -> str:
    """OCR every embedded image on *page* and join the recognized text.

    Best-effort fallback for scanned pages: an unreadable image (or one with
    no recognizable text) simply contributes nothing, and a page whose
    images cannot even be listed yields "".
    """
    try:
        page_images = page.images
    except Exception:
        page_images = []
    chunks = []
    for image in page_images:
        try:
            recognized = pytesseract.image_to_string(Image.open(BytesIO(image.data)))
            if recognized.strip():
                chunks.append(recognized)
        except Exception:
            # Skip images that PIL/tesseract cannot handle.
            continue
    return "\n".join(chunks).strip()
def _heading_from_text(text: str) -> str:
    """Heuristically pick a section heading from a page's first lines.

    Scans up to the first 8 non-blank lines; the first one that matches
    HEADING_PATTERN and has at most 16 words wins (truncated to 120 chars).
    Returns "" when nothing qualifies.
    """
    stripped = (line.strip() for line in text.splitlines())
    candidates = [line for line in stripped if line][:8]
    for candidate in candidates:
        looks_like_heading = HEADING_PATTERN.match(candidate) is not None
        if looks_like_heading and len(candidate.split()) <= 16:
            return candidate[:120]  # cap stored heading length
    return ""
def ingest_pdf(
    path: str,
    doc_id: str | None = None,
    doc_title: str | None = None,
    *,
    ocr_min_chars: int = 20,
) -> List[PageRecord]:
    """Extract every page of the PDF at *path* into PageRecord objects.

    Args:
        path: Filesystem path of the PDF.
        doc_id: Stable identifier; defaults to the first 12 hex chars of a
            uuid5 derived from *path* (deterministic across runs).
        doc_title: Human-readable title; defaults to the filename stem.
        ocr_min_chars: Keyword-only. Pages whose extracted text is shorter
            than this are assumed scanned and retried with OCR (previously a
            hard-coded 20); the OCR result is kept only if it is longer.

    Returns:
        Records in page order; pages yielding no text are still included
        with empty ``text``.
    """
    reader = PdfReader(path)
    doc_id = doc_id or uuid.uuid5(uuid.NAMESPACE_URL, path).hex[:12]
    doc_title = doc_title or os.path.splitext(os.path.basename(path))[0]
    records: List[PageRecord] = []
    for i, page in enumerate(reader.pages):
        try:
            raw = page.extract_text() or ""
        except Exception:  # pypdf can fail on malformed content streams
            raw = ""
        if len(raw.strip()) < ocr_min_chars:  # likely a scanned page
            raw_ocr = _ocr_page(page)
            if len(raw_ocr) > len(raw):
                raw = raw_ocr
        lowered = raw.lower()  # hoisted: lower() once, not once per anchor
        records.append(PageRecord(
            doc_id=doc_id,
            doc_title=doc_title,
            page_index=i,
            page_label=str(_extract_page_label(reader, i)),
            text=raw,
            section_heading=_heading_from_text(raw),
            span_start=0,  # whole-page record: span covers the full text
            span_stop=len(raw),
            has_anchors={a: a.lower() in lowered for a in ANCHOR_PHRASES},
            source=path,
        ))
    return records
def build_vectorstore(
    records: List[PageRecord],
    index_dir: str = "faiss_index_new",
    *,
    model_name: str = "sentence-transformers/all-MiniLM-L6-v2",
) -> str:
    """Embed *records* into a FAISS index and persist it under *index_dir*.

    Also writes a ``pages.jsonl`` sidecar in *index_dir* so the raw page
    records can be inspected or rebuilt without loading FAISS.

    Args:
        records: Page records produced by :func:`ingest_pdf`.
        index_dir: Output directory (created if missing).
        model_name: Keyword-only. Sentence-transformers model used for
            embeddings (previously hard-coded to the default shown).

    Returns:
        The *index_dir* path, for chaining.
    """
    os.makedirs(index_dir, exist_ok=True)
    embeddings = HuggingFaceEmbeddings(model_name=model_name)
    docs = [
        Document(
            page_content=r.text,
            metadata={
                "doc_id": r.doc_id,
                "doc_title": r.doc_title,
                "page_index": r.page_index,
                "page_label": r.page_label,
                "section_heading": r.section_heading,
                "span_start": r.span_start,
                "span_stop": r.span_stop,
                "source": r.source,
                # Flatten anchor flags into scalar metadata keys.
                **{f"anchor_{k}": v for k, v in r.has_anchors.items()},
            },
        )
        for r in records
    ]
    vs = FAISS.from_documents(docs, embeddings)
    vs.save_local(index_dir)
    # Sidecar JSONL: one record per line, UTF-8, non-ASCII preserved.
    with open(os.path.join(index_dir, "pages.jsonl"), "w", encoding="utf-8") as f:
        for r in records:
            f.write(json.dumps(asdict(r), ensure_ascii=False) + "\n")
    return index_dir
if __name__ == "__main__":
    # CLI entry point: ingest one PDF and build the FAISS index + JSONL.
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("pdf", help="Path to PDF")
    parser.add_argument("--doc-id")
    parser.add_argument("--doc-title")
    parser.add_argument("--out", default="faiss_index_new")
    cli = parser.parse_args()

    pages = ingest_pdf(cli.pdf, doc_id=cli.doc_id, doc_title=cli.doc_title)
    build_vectorstore(pages, cli.out)
    print(f"Ingested {len(pages)} pages -> {cli.out}")