File size: 3,577 Bytes
e27c97c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
from langchain_core.documents import Document
from collections import defaultdict
import re
import pdfplumber
import fitz  # PyMuPDF
import camelot
import pytesseract
from PIL import Image
import io


# -------------------------------
# STEP 1: EXTRACT RAW CONTENT
# -------------------------------
def raw_document_text(pdf_path: str) -> list:
    """Extract text, tables and OCR'd image text from a PDF, page by page.

    For every page the function collects, in this order:

    * the page text returned by pdfplumber (skipped when empty),
    * every table camelot detects with the ``stream`` flavor, rendered
      as a plain string,
    * the OCR text of every embedded image (skipped when blank).

    Args:
        pdf_path: Path to the PDF file to read.

    Returns:
        A list of dicts, each with a ``"content"`` string and a
        ``"metadata"`` dict holding the 1-based ``"page"`` number, a
        ``"type"`` of ``"text"``, ``"table"`` or ``"image"`` and, for
        tables and images, a ``"ref"`` label numbered per page.
    """
    import logging  # local import keeps this function self-contained

    logger = logging.getLogger(__name__)
    documents = []

    # Both handles are context-managed so the PyMuPDF document is released
    # as well, even if extraction raises part-way through.
    with pdfplumber.open(pdf_path) as pdf, fitz.open(pdf_path) as doc_fitz:
        for page_index, page in enumerate(pdf.pages, start=1):

            # -------- TEXT --------
            text = page.extract_text()
            if text:
                documents.append({
                    "content": text,
                    "metadata": {
                        "page": page_index,
                        "type": "text"
                    }
                })

            # -------- TABLES --------
            # Table extraction is best-effort: a failure on one page must not
            # abort the run, but it is logged rather than silently dropped.
            try:
                tables = camelot.read_pdf(
                    pdf_path,
                    pages=str(page_index),
                    flavor="stream"
                )

                for t_idx, table in enumerate(tables):
                    table_text = table.df.to_string(index=False)
                    documents.append({
                        "content": table_text,
                        "metadata": {
                            "page": page_index,
                            "type": "table",
                            "ref": f"Table {t_idx + 1}"
                        }
                    })
            except Exception:
                logger.warning(
                    "Table extraction failed on page %d of %s",
                    page_index,
                    pdf_path,
                    exc_info=True,
                )

            # -------- IMAGES + OCR --------
            # pdfplumber pages are numbered from 1 here, PyMuPDF pages from 0.
            page_fitz = doc_fitz[page_index - 1]
            images = page_fitz.get_images(full=True)

            for img_idx, img in enumerate(images):
                xref = img[0]  # first tuple entry is the image's xref
                base_image = doc_fitz.extract_image(xref)
                image_bytes = base_image["image"]

                # Close the PIL image as soon as OCR is done.
                with Image.open(io.BytesIO(image_bytes)) as image:
                    ocr_text = pytesseract.image_to_string(image)

                if ocr_text.strip():
                    documents.append({
                        "content": ocr_text,
                        "metadata": {
                            "page": page_index,
                            "type": "image",
                            "ref": f"Image {img_idx + 1}"
                        }
                    })

    return documents


# -------------------------------
# STEP 2: RAW → LANGCHAIN DOCS
# -------------------------------
def to_langchain_documents(raw_docs):
    """Wrap raw extraction records in LangChain ``Document`` objects.

    Args:
        raw_docs: Iterable of dicts, each providing a ``"content"`` entry
            (used as ``page_content``) and a ``"metadata"`` entry.

    Returns:
        A list of ``Document`` instances in the same order as ``raw_docs``.
    """
    return [
        Document(page_content=entry["content"], metadata=entry["metadata"])
        for entry in raw_docs
    ]


# -------------------------------
# STEP 3: BUILD INVERTED INDEX
# -------------------------------
def build_inverted_index(lc_docs):
    """Map every lower-cased word to the set of documents containing it.

    Args:
        lc_docs: Sequence of objects exposing a ``page_content`` string.

    Returns:
        A ``defaultdict(set)`` keyed by word; each value holds the
        positions (within ``lc_docs``) of the documents using that word.
    """
    token_pattern = re.compile(r"\b\w+\b")
    postings = defaultdict(set)

    for position, document in enumerate(lc_docs):
        lowered = document.page_content.lower()
        # Scan left to right so keys are inserted in first-seen order.
        for match in token_pattern.finditer(lowered):
            postings[match.group()].add(position)

    return postings


# -------------------------------
# STEP 4: RUN PIPELINE
# -------------------------------
if __name__ == "__main__":
    import sys
    from itertools import islice

    # The PDF path may be given as the first command-line argument;
    # without one the script falls back to "Report.pdf".
    pdf_path = sys.argv[1] if len(sys.argv) > 1 else "Report.pdf"

    # Run the three pipeline stages in order.
    raw_docs = raw_document_text(pdf_path)
    lc_docs = to_langchain_documents(raw_docs)
    index = build_inverted_index(lc_docs)

    print(f"Total LangChain Documents: {len(lc_docs)}")
    print(f"Total Indexed Words: {len(index)}")

    # Preview the first 20 index entries without copying the whole index.
    print(dict(islice(index.items(), 20)))