File size: 2,071 Bytes
e27c97c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
import pdfplumber
import fitz
import camelot
import pytesseract
from PIL import Image
import io


# Raw Documents
def raw_document_text(pdf_path: str) -> list:
    """Collect text, tables, and OCR'd image text from a PDF, page by page.

    The PDF is opened with pdfplumber (page text) and PyMuPDF (embedded
    images); camelot is invoked once per page for table detection.

    Args:
        pdf_path: Path of the PDF file to process.

    Returns:
        A list of dicts in page order. Each dict has a ``"content"`` string
        and a ``"metadata"`` dict holding the 1-based ``"page"`` number and a
        ``"type"`` of ``"text"``, ``"table"``, or ``"image"``. Table and image
        entries additionally carry a ``"ref"`` label such as ``"Table 1"`` or
        ``"Image 2"``, numbered per page. Pages without extractable text and
        images whose OCR output is blank produce no entry.
    """
    documents = []

    with pdfplumber.open(pdf_path) as pdf:
        doc_fitz = fitz.open(pdf_path)
        # The PyMuPDF handle is opened outside a ``with`` statement, so it is
        # closed explicitly even if one of the extractors below raises.
        try:
            for page_index, page in enumerate(pdf.pages, start=1):

                # TEXT
                text = page.extract_text()
                if text:
                    documents.append({
                        "content": text,
                        "metadata": {
                            "page": page_index,
                            "type": "text"
                        }
                    })

                # TABLES
                # camelot takes the page selection as a string, so the
                # 1-based page number is passed through str().
                tables = camelot.read_pdf(
                    pdf_path,
                    pages=str(page_index),
                    flavor="stream"
                )

                for t_idx, table in enumerate(tables):
                    table_text = table.df.to_string(index=False)
                    documents.append({
                        "content": table_text,
                        "metadata": {
                            "page": page_index,
                            "type": "table",
                            "ref": f"Table {t_idx + 1}"
                        }
                    })

                # IMAGES + OCR
                # PyMuPDF pages are indexed from 0, pdfplumber numbering
                # above starts at 1.
                page_fitz = doc_fitz[page_index - 1]
                images = page_fitz.get_images(full=True)

                for img_idx, img in enumerate(images):
                    # First item of each image tuple is the xref used to
                    # pull the raw image bytes out of the document.
                    xref = img[0]
                    base_image = doc_fitz.extract_image(xref)
                    image_bytes = base_image["image"]

                    # Release the decoded image as soon as OCR is done.
                    with Image.open(io.BytesIO(image_bytes)) as image:
                        ocr_text = pytesseract.image_to_string(image)

                    # Skip images where OCR found nothing but whitespace.
                    if ocr_text.strip():
                        documents.append({
                            "content": ocr_text,
                            "metadata": {
                                "page": page_index,
                                "type": "image",
                                "ref": f"Image {img_idx + 1}"
                            }
                        })
        finally:
            doc_fitz.close()

    return documents