Spaces:
Running
Running
File size: 2,071 Bytes
e27c97c | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 | import pdfplumber
import fitz
import camelot
import pytesseract
from PIL import Image
import io
# Raw Documents
def raw_document_text(pdf_path: str):
documents = []
# Open PDF
with pdfplumber.open(pdf_path) as pdf:
doc_fitz = fitz.open(pdf_path)
for page_index, page in enumerate(pdf.pages, start=1):
# TEXT
text = page.extract_text()
if text:
documents.append({
"content": text,
"metadata": {
"page": page_index,
"type": "text"
}
})
# TABLES
tables = camelot.read_pdf(
pdf_path,
pages=str(page_index),
flavor="stream"
)
for t_idx, table in enumerate(tables):
table_text = table.df.to_string(index=False)
documents.append({
"content": table_text,
"metadata": {
"page": page_index,
"type": "table",
"ref": f"Table {t_idx + 1}"
}
})
# IMAGES + OCR
page_fitz = doc_fitz[page_index - 1]
images = page_fitz.get_images(full=True)
for img_idx, img in enumerate(images):
xref = img[0]
base_image = doc_fitz.extract_image(xref)
image_bytes = base_image["image"]
image = Image.open(io.BytesIO(image_bytes))
ocr_text = pytesseract.image_to_string(image)
if ocr_text.strip():
documents.append({
"content": ocr_text,
"metadata": {
"page": page_index,
"type": "image",
"ref": f"Image {img_idx + 1}"
}
})
return documents
|