"""Ingestion pipeline to build a page-level FAISS index with rich metadata.
Features:
- Per page extraction (page_index 0-based, page_label as shown in PDF)
- Optional OCR fallback for blank / low-text pages (scanned PDFs)
- Records include: doc_id, doc_title, page_index, page_label, text,
section_heading (heuristic), span_start/stop (page chars),
has_anchor flags for configured phrases.
- Outputs JSONL + builds FAISS vector store (sentence-transformers/all-MiniLM-L6-v2)
Note: This is a lightweight scaffold; tune heading detection + anchors as needed.
"""
from __future__ import annotations
import os, re, json, uuid
from dataclasses import dataclass, asdict
from typing import List, Dict, Iterable
from pypdf import PdfReader
import pytesseract
from PIL import Image
from io import BytesIO
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.schema import Document
# Phrases whose presence is recorded per page as a boolean flag; each one is
# surfaced in vector-store metadata as "anchor_<phrase>" by build_vectorstore.
ANCHOR_PHRASES = [
"Specifically these objectives are",
]
# Heuristic heading matcher: either an ALL-CAPS run (first cap + >=3 more
# caps/spaces/hyphens) or a numbered section line like "2.1 Some Title".
HEADING_PATTERN = re.compile(r"^\s*(?:[A-Z][A-Z \-]{3,}|\d+\.[0-9.]*\s+.+)$")
@dataclass
class PageRecord:
    """One extracted PDF page plus retrieval metadata.

    Serialized to JSONL via ``dataclasses.asdict`` and mirrored into FAISS
    document metadata by ``build_vectorstore``.
    """

    doc_id: str  # stable short id (uuid5 of the source path unless caller supplies one)
    doc_title: str  # human-readable title (filename stem unless caller supplies one)
    page_index: int  # 0-based physical page position in the PDF
    page_label: str  # logical label as shown in a PDF viewer (may differ from index)
    text: str  # extracted text (OCR result when extraction yielded too little)
    section_heading: str  # heuristic heading from the top of the page, "" if none
    span_start: int  # char offset of text start within the page (always 0 in this pipeline)
    span_stop: int  # char offset of text end (== len(text))
    has_anchors: Dict[str, bool]  # anchor phrase -> found on this page (case-insensitive)?
    source: str # original path
def _extract_page_label(reader, idx: int) -> str:
# Attempt to read logical page label from PDF (if present); fallback to idx+1
try:
return reader.page_labels[idx]
except Exception:
return str(idx + 1)
def _ocr_page(page) -> str:
    """Best-effort OCR over every embedded image on *page*.

    Returns the recognized texts joined with newlines (stripped), or ""
    when the page exposes no images or OCR produced nothing. Individual
    image failures are skipped silently — this is a fallback path for
    scanned pages, not a hard requirement.
    """
    try:
        page_images = page.images
    except Exception:
        page_images = []
    chunks = []
    for image in page_images:
        try:
            pil_image = Image.open(BytesIO(image.data))
            recognized = pytesseract.image_to_string(pil_image)
            if recognized.strip():
                chunks.append(recognized)
        except Exception:
            # Corrupt/unsupported image or OCR failure: move on.
            continue
    return "\n".join(chunks).strip()
def _heading_from_text(text: str) -> str:
    """Return a heuristic section heading from the top of *text*, or "".

    Scans the first 8 non-blank lines; a line qualifies when it matches
    HEADING_PATTERN and has at most 16 words. The result is capped at
    120 characters.
    """
    inspected = 0
    for raw_line in text.splitlines():
        candidate = raw_line.strip()
        if not candidate:
            continue
        inspected += 1
        if inspected > 8:  # only the top of the page matters
            break
        if HEADING_PATTERN.match(candidate) and len(candidate.split()) <= 16:
            return candidate[:120]
    return ""
def ingest_pdf(path: str, doc_id: str | None = None, doc_title: str | None = None) -> List[PageRecord]:
    """Extract one :class:`PageRecord` per page of the PDF at *path*.

    Pages whose extracted text is very short (< 20 chars, i.e. likely a
    scanned page) are retried via OCR on their embedded images, keeping
    whichever result is longer.

    Args:
        path: Filesystem path to the PDF.
        doc_id: Stable document id; defaults to the first 12 hex chars of a
            uuid5 derived from *path*, so it is deterministic per path.
        doc_title: Defaults to the filename without its extension.

    Returns:
        Records in physical page order, one per page.
    """
    reader = PdfReader(path)
    doc_id = doc_id or uuid.uuid5(uuid.NAMESPACE_URL, path).hex[:12]
    doc_title = doc_title or os.path.splitext(os.path.basename(path))[0]
    records: List[PageRecord] = []
    for i, page in enumerate(reader.pages):
        try:
            raw = page.extract_text() or ""
        except Exception:
            # Extraction can fail on malformed pages; treat as empty and
            # let the OCR fallback below take over.
            raw = ""
        if len(raw.strip()) < 20: # fallback to OCR for likely scanned page
            raw_ocr = _ocr_page(page)
            if len(raw_ocr) > len(raw):
                raw = raw_ocr
        page_label = _extract_page_label(reader, i)
        heading = _heading_from_text(raw)
        # Case-insensitive anchor-phrase detection per page.
        has_anchors = {a: (a.lower() in raw.lower()) for a in ANCHOR_PHRASES}
        rec = PageRecord(
            doc_id=doc_id,
            doc_title=doc_title,
            page_index=i,
            page_label=str(page_label),
            text=raw,
            section_heading=heading,
            # Spans currently cover the whole page; kept for future sub-page chunking.
            span_start=0,
            span_stop=len(raw),
            has_anchors=has_anchors,
            source=path,
        )
        records.append(rec)
    return records
def build_vectorstore(records: List[PageRecord], index_dir: str = "faiss_index_new") -> str:
    """Embed *records* into a FAISS index under *index_dir* and dump them as JSONL.

    Each FAISS document carries the record's metadata plus one boolean
    "anchor_<phrase>" entry per configured anchor. A sidecar
    ``pages.jsonl`` with the full records is written next to the index.

    Returns:
        *index_dir*, for convenient chaining.
    """
    os.makedirs(index_dir, exist_ok=True)
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    docs = []
    for rec in records:
        metadata = {
            "doc_id": rec.doc_id,
            "doc_title": rec.doc_title,
            "page_index": rec.page_index,
            "page_label": rec.page_label,
            "section_heading": rec.section_heading,
            "span_start": rec.span_start,
            "span_stop": rec.span_stop,
            "source": rec.source,
        }
        for anchor, present in rec.has_anchors.items():
            metadata[f"anchor_{anchor}"] = present
        docs.append(Document(page_content=rec.text, metadata=metadata))
    vs = FAISS.from_documents(docs, embeddings)
    vs.save_local(index_dir)
    # Sidecar JSONL mirrors the records for inspection / reprocessing.
    jsonl_path = os.path.join(index_dir, "pages.jsonl")
    with open(jsonl_path, "w", encoding="utf-8") as f:
        for rec in records:
            f.write(json.dumps(asdict(rec), ensure_ascii=False) + "\n")
    return index_dir
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("pdf", help="Path to PDF")
    parser.add_argument("--doc-id")
    parser.add_argument("--doc-title")
    parser.add_argument("--out", default="faiss_index_new")
    cli = parser.parse_args()

    # Extract per-page records, then embed + persist them with a JSONL dump.
    pages = ingest_pdf(cli.pdf, doc_id=cli.doc_id, doc_title=cli.doc_title)
    build_vectorstore(pages, cli.out)
    print(f"Ingested {len(pages)} pages -> {cli.out}")