File size: 4,903 Bytes
b022bee
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
"""Ingestion pipeline to build a page‑level FAISS index with rich metadata.

Features:
 - Per page extraction (page_index 0-based, page_label as shown in PDF)
 - Optional OCR fallback for blank / low-text pages (scanned PDFs)
 - Records include: doc_id, doc_title, page_index, page_label, text,
                    section_heading (heuristic), span_start/stop (page chars),
                    has_anchor flags for configured phrases.
 - Outputs JSONL + builds FAISS vector store (sentence-transformers/all-MiniLM-L6-v2)

Note: This is a lightweight scaffold; tune heading detection + anchors as needed.
"""
from __future__ import annotations
import os, re, json, uuid
from dataclasses import dataclass, asdict
from typing import List, Dict, Iterable
from pypdf import PdfReader
import pytesseract
from PIL import Image
from io import BytesIO
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.schema import Document

# Phrases whose per-page presence is recorded as boolean has_anchors flags
# (and exported as anchor_* metadata keys on the vector-store documents).
ANCHOR_PHRASES = [
    "Specifically these objectives are",
]

# Heuristic heading matcher, applied line-by-line: either an ALL-CAPS run
# (first capital followed by >=4 capitals/spaces/hyphens) or a numbered
# section label such as "2.1 Title"; anchored to the whole line.
HEADING_PATTERN = re.compile(r"^\s*(?:[A-Z][A-Z \-]{3,}|\d+\.[0-9.]*\s+.+)$")

@dataclass
class PageRecord:
    """One extracted PDF page plus the metadata stored alongside it.

    Instances are serialized to JSONL via dataclasses.asdict and flattened
    into vector-store document metadata by build_vectorstore.
    """
    doc_id: str            # short stable id for the source document
    doc_title: str         # human-readable title (defaults to file stem)
    page_index: int        # 0-based physical page position
    page_label: str        # logical label as shown in the PDF (e.g. "iv", "12")
    text: str              # full extracted (or OCR'd) page text
    section_heading: str   # heuristic heading found near the top; "" if none
    span_start: int        # character span within the page text (currently 0)
    span_stop: int         # character span end (currently len(text))
    has_anchors: Dict[str, bool]  # anchor phrase -> present on this page?
    source: str  # original path

def _extract_page_label(reader, idx: int) -> str:
    # Attempt to read logical page label from PDF (if present); fallback to idx+1
    try:
        return reader.page_labels[idx]
    except Exception:
        return str(idx + 1)

def _ocr_page(page) -> str:
    """OCR every embedded image on *page* and join the recognized text.

    Best-effort: pages without an images collection, undecodable image
    streams, and OCR failures are all silently skipped. Returns "" when
    nothing was recognized.
    """
    try:
        embedded = page.images
    except Exception:
        embedded = []

    chunks = []
    for image_file in embedded:
        try:
            pil_image = Image.open(BytesIO(image_file.data))
            recognized = pytesseract.image_to_string(pil_image)
        except Exception:
            continue
        if recognized.strip():
            chunks.append(recognized)
    return "\n".join(chunks).strip()

def _heading_from_text(text: str) -> str:
    """Heuristically pick a section heading from the top of a page.

    Scans the first 8 non-empty lines; the first one that matches
    HEADING_PATTERN and has at most 16 words wins, truncated to 120 chars.
    Returns "" when no line qualifies.
    """
    candidates = (line.strip() for line in text.splitlines())
    nonblank = [line for line in candidates if line]
    for line in nonblank[:8]:
        if len(line.split()) <= 16 and HEADING_PATTERN.match(line):
            return line[:120]
    return ""

def ingest_pdf(path: str, doc_id: str | None = None, doc_title: str | None = None) -> List[PageRecord]:
    """Extract every page of the PDF at *path* into PageRecord objects.

    Args:
        path: Filesystem path to the PDF.
        doc_id: Stable document id; defaults to the first 12 hex chars of a
            UUID5 derived from *path* (deterministic per path).
        doc_title: Human-readable title; defaults to the file name stem.

    Returns:
        One PageRecord per page, in physical page order. Pages whose text
        extraction yields fewer than 20 non-whitespace-trimmed characters are
        retried via OCR, and the longer of the two results is kept.
    """
    reader = PdfReader(path)
    doc_id = doc_id or uuid.uuid5(uuid.NAMESPACE_URL, path).hex[:12]
    doc_title = doc_title or os.path.splitext(os.path.basename(path))[0]
    records: List[PageRecord] = []
    for i, page in enumerate(reader.pages):
        try:
            raw = page.extract_text() or ""
        except Exception:
            # Some malformed pages raise inside pypdf; treat as empty.
            raw = ""
        if len(raw.strip()) < 20:  # likely a scanned page -> OCR fallback
            raw_ocr = _ocr_page(page)
            if len(raw_ocr) > len(raw):
                raw = raw_ocr
        page_label = _extract_page_label(reader, i)
        heading = _heading_from_text(raw)
        # Hoist the lowercasing out of the per-anchor comprehension.
        raw_lower = raw.lower()
        has_anchors = {a: (a.lower() in raw_lower) for a in ANCHOR_PHRASES}
        rec = PageRecord(
            doc_id=doc_id,
            doc_title=doc_title,
            page_index=i,
            page_label=str(page_label),
            text=raw,
            section_heading=heading,
            span_start=0,
            span_stop=len(raw),  # whole-page span; refine if sub-page chunks are added
            has_anchors=has_anchors,
            source=path,
        )
        records.append(rec)
    return records

def build_vectorstore(records: List[PageRecord], index_dir: str = "faiss_index_new") -> str:
    """Embed *records* into a local FAISS index and dump them as JSONL.

    Builds one vector-store Document per page (MiniLM-L6-v2 embeddings),
    flattening each record's anchor flags into anchor_<phrase> metadata
    keys, saves the index under *index_dir*, and writes pages.jsonl next
    to it. Returns *index_dir*.
    """
    os.makedirs(index_dir, exist_ok=True)
    embedder = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

    documents = []
    for rec in records:
        metadata = {
            "doc_id": rec.doc_id,
            "doc_title": rec.doc_title,
            "page_index": rec.page_index,
            "page_label": rec.page_label,
            "section_heading": rec.section_heading,
            "span_start": rec.span_start,
            "span_stop": rec.span_stop,
            "source": rec.source,
        }
        for phrase, present in rec.has_anchors.items():
            metadata[f"anchor_{phrase}"] = present
        documents.append(Document(page_content=rec.text, metadata=metadata))

    store = FAISS.from_documents(documents, embedder)
    store.save_local(index_dir)

    # Sidecar JSONL with the full records for non-vector consumers.
    jsonl_path = os.path.join(index_dir, "pages.jsonl")
    with open(jsonl_path, "w", encoding="utf-8") as fh:
        for rec in records:
            fh.write(json.dumps(asdict(rec), ensure_ascii=False) + "\n")
    return index_dir

if __name__ == "__main__":
    import argparse
    ap = argparse.ArgumentParser()
    ap.add_argument("pdf", help="Path to PDF")
    ap.add_argument("--doc-id")
    ap.add_argument("--doc-title")
    ap.add_argument("--out", default="faiss_index_new")
    args = ap.parse_args()
    recs = ingest_pdf(args.pdf, doc_id=args.doc_id, doc_title=args.doc_title)
    build_vectorstore(recs, args.out)
    print(f"Ingested {len(recs)} pages -> {args.out}")