Spaces:
Running
Running
File size: 3,577 Bytes
e27c97c | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 | from langchain_core.documents import Document
from collections import defaultdict
import re
import pdfplumber
import fitz # PyMuPDF
import camelot
import pytesseract
from PIL import Image
import io
# -------------------------------
# STEP 1: EXTRACT RAW CONTENT
# -------------------------------
def _extract_page_tables(pdf_path, page_index):
    """Return table records camelot finds on one page (best effort).

    Any exception raised by camelot or while rendering a table is logged
    and swallowed so that a problematic page cannot abort the whole
    extraction; records collected before the failure are still returned.
    """
    import logging  # local import: keeps this block self-contained

    records = []
    try:
        tables = camelot.read_pdf(
            pdf_path,
            pages=str(page_index),
            flavor="stream",
        )
        for t_idx, table in enumerate(tables):
            records.append({
                "content": table.df.to_string(index=False),
                "metadata": {
                    "page": page_index,
                    "type": "table",
                    "ref": f"Table {t_idx + 1}",
                },
            })
    except Exception:
        # Deliberately best-effort, but leave a trace instead of hiding it.
        logging.getLogger(__name__).warning(
            "Table extraction failed on page %s of %s",
            page_index,
            pdf_path,
            exc_info=True,
        )
    return records


def _extract_page_image_text(doc_fitz, page_index):
    """Return OCR records for the images embedded in one page.

    ``page_index`` is 1-based; images whose OCR output is blank are skipped.
    """
    records = []
    page_fitz = doc_fitz[page_index - 1]  # the fitz document is 0-indexed
    for img_idx, img in enumerate(page_fitz.get_images(full=True)):
        xref = img[0]
        base_image = doc_fitz.extract_image(xref)
        image = Image.open(io.BytesIO(base_image["image"]))
        ocr_text = pytesseract.image_to_string(image)
        if ocr_text.strip():
            records.append({
                "content": ocr_text,
                "metadata": {
                    "page": page_index,
                    "type": "image",
                    "ref": f"Image {img_idx + 1}",
                },
            })
    return records


def raw_document_text(pdf_path: str):
    """Extract text, tables and image OCR text from a PDF, page by page.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        A list of dicts, each with a ``"content"`` string and a
        ``"metadata"`` dict holding ``"page"`` (1-based) and ``"type"``
        (``"text"``, ``"table"`` or ``"image"``). Table and image records
        also carry a ``"ref"`` label numbered per page. For every page the
        order is text, then tables, then images.

    Table extraction is best effort: failures are logged and skipped.
    Errors from text extraction or OCR propagate to the caller.
    """
    documents = []
    # Both handles are context-managed so the PyMuPDF document is closed
    # too, including when an exception escapes the loop.
    with pdfplumber.open(pdf_path) as pdf, fitz.open(pdf_path) as doc_fitz:
        for page_index, page in enumerate(pdf.pages, start=1):
            # -------- TEXT --------
            text = page.extract_text()
            if text:
                documents.append({
                    "content": text,
                    "metadata": {
                        "page": page_index,
                        "type": "text",
                    },
                })
            # -------- TABLES --------
            documents.extend(_extract_page_tables(pdf_path, page_index))
            # -------- IMAGES + OCR --------
            documents.extend(_extract_page_image_text(doc_fitz, page_index))
    return documents
# -------------------------------
# STEP 2: RAW → LANGCHAIN DOCS
# -------------------------------
def to_langchain_documents(raw_docs):
    """Wrap each raw record in a LangChain ``Document``.

    Every item of ``raw_docs`` must provide ``"content"`` and ``"metadata"``
    keys; they become ``page_content`` and ``metadata`` respectively.
    The result preserves the input order.
    """
    return [
        Document(page_content=record["content"], metadata=record["metadata"])
        for record in raw_docs
    ]
# -------------------------------
# STEP 3: BUILD INVERTED INDEX
# -------------------------------
def build_inverted_index(lc_docs):
    """Map every lower-cased word to the set of document positions using it.

    ``lc_docs`` is an iterable of objects exposing a ``page_content`` string.
    A document's id is its position in that iterable. The result is a
    ``defaultdict(set)`` keyed by word.
    """
    word_pattern = re.compile(r"\b\w+\b")
    postings = defaultdict(set)
    for position, document in enumerate(lc_docs):
        lowered = document.page_content.lower()
        for hit in word_pattern.finditer(lowered):
            postings[hit.group()].add(position)
    return postings
# -------------------------------
# STEP 4: RUN PIPELINE
# -------------------------------
if __name__ == "__main__":
    import sys
    from itertools import islice

    # The PDF path may be passed as the first command-line argument.
    # With no argument the original default is used, so existing
    # invocations behave as before.
    pdf_path = sys.argv[1] if len(sys.argv) > 1 else "Report.pdf"

    raw_docs = raw_document_text(pdf_path)
    lc_docs = to_langchain_documents(raw_docs)
    index = build_inverted_index(lc_docs)

    print(f"Total LangChain Documents: {len(lc_docs)}")
    print(f"Total Indexed Words: {len(index)}")
    # Preview the first 20 index entries without copying the whole index.
    print(dict(islice(index.items(), 20)))
|