File size: 3,346 Bytes
206ef5f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c926830
 
206ef5f
 
 
 
c926830
206ef5f
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
import numpy
from PIL import Image
from typing import List, Tuple

import pymupdf

from .ChartClassifier import Classifier
from .HelperFunctions import CountTokens
from .ModelCallingFunctions import image_data_extractor


def _pixmap_to_rgb_image(pixmap: pymupdf.Pixmap) -> Image.Image:
    """Return a PIL RGB image built from *pixmap*.

    ``Image.frombytes(mode="RGB", ...)`` expects exactly three bytes per
    pixel, so pixmaps carrying an alpha channel or using a non-RGB colorspace
    (grayscale, CMYK, ...) are normalised to plain RGB first.
    """
    if pixmap.alpha:
        # Pixmap(source, alpha) copies the pixmap; alpha=0 drops the channel.
        pixmap = pymupdf.Pixmap(pixmap, 0)
    if pixmap.n != 3:
        # Pixmap(colorspace, source) copies the pixmap into that colorspace.
        pixmap = pymupdf.Pixmap(pymupdf.csRGB, pixmap)
    return Image.frombytes(
        mode="RGB", size=(pixmap.width, pixmap.height), data=pixmap.samples
    )


def extract_image_content(pixmap_list: List[pymupdf.Pixmap], text: str) -> List[str]:
    """Describe the graph-like images among *pixmap_list* as text.

    Each pixmap is converted to a PIL RGB image, the images are run through
    the chart classifier, and every image flagged by the classifier is passed
    to ``image_data_extractor`` together with *text*.

    Args:
        pixmap_list: Pixmaps of the images to inspect.
        text: Text forwarded unchanged to ``image_data_extractor`` for every
            flagged image (the caller in this file passes the page text).

    Returns:
        One string per flagged image, in input order. Empty when no pixmap
        could be converted or no image was flagged.
    """
    img_list = []
    for pixmap in pixmap_list:
        try:
            img_list.append(_pixmap_to_rgb_image(pixmap))
        except Exception as e:
            # Best effort: an image that cannot be converted is skipped so the
            # remaining images of the page are still processed.
            print(e)

    # Nothing to classify: skip loading the model altogether.
    if not img_list:
        return []

    # Start Classifier inference session
    classifier = Classifier("utils/graph_classifierV2_B.onnx")

    # Presumably one truthy/falsy flag per image, in the same order as
    # img_list -- TODO confirm against Classifier.classify.
    graph_image = classifier.classify(img_list)
    print(graph_image)

    return [
        str(image_data_extractor(img=img, text=text))
        for img, is_graph in zip(img_list, graph_image)
        if is_graph
    ]


def _split_into_parts(content: str, n_parts: int) -> List[str]:
    """Cut *content* into *n_parts* consecutive slices that cover all of it."""
    part_size = len(content) // n_parts
    parts = []
    start = 0
    for nth_part in range(n_parts):
        # The last slice runs to the end of the string so the remainder of the
        # integer division is kept instead of being dropped.
        is_last = nth_part == n_parts - 1
        end = len(content) if is_last else start + part_size
        parts.append(content[start:end])
        start = end
    return parts


def ProcessPdf(pdf_content: bytes) -> List[Tuple[str, str]]:
    """
    Extract the textual and image-derived content of a PDF, page by page.

    For every page the plain text is read (newlines replaced by tabs) and the
    output of ``extract_image_content`` for the page's images is appended.
    An image xref is only processed the first time it is encountered in the
    document. Pages whose token count (as reported by ``CountTokens``) exceeds
    500 are cut into roughly equal character slices.

    Args:
        pdf_content: Raw bytes of the PDF file.

    Returns:
        A list of ``(content, page_id)`` tuples. ``page_id`` is the zero-based
        page index as a string (``"3"``), or ``"<page>_<part>"`` (``"3_0"``,
        ``"3_1"``, ...) when the page was split.
    """
    max_tokens = 500

    print("Extract content called ")
    pdf_doc = pymupdf.open(stream=pdf_content, filetype="pdf")

    pages_content = []
    # xrefs already converted; a set keeps the membership test O(1).
    seen_xrefs = set()
    try:
        for page_number in range(pdf_doc.page_count):
            # extracting text content
            page = pdf_doc.load_page(page_number)
            text_content = str(page.get_text()).replace("\n", "\t")
            page_content = text_content

            # extracting image content
            pixmap_list = []
            for img_info in page.get_image_info(xrefs=True):
                xref = img_info["xref"]
                if xref in seen_xrefs:
                    continue
                try:
                    pixmap_list.append(pymupdf.Pixmap(pdf_doc, xref))
                    # Only remember xrefs that were actually converted.
                    seen_xrefs.add(xref)
                except ValueError as e:
                    print(f"Skipping image with xref {xref} due to error: {e}")

            if pixmap_list:
                img_content = extract_image_content(
                    pixmap_list=pixmap_list, text=text_content
                )
                # Avoid a dangling separator when no image yielded any text.
                if img_content:
                    page_content = page_content + "\n\n" + "\n\n".join(img_content)

            pages_content.append(page_content)
    finally:
        # Only plain strings are needed from here on; release the document
        # even when extraction fails part-way through.
        pdf_doc.close()

    # Presumably one token count per entry of pages_content, in order --
    # TODO confirm against CountTokens.
    num_tokens = CountTokens(pages_content)

    final_data = []

    # Split pages whose token count exceeds the limit.
    for page_idx, n_token in enumerate(num_tokens):
        content = pages_content[page_idx]
        if n_token > max_tokens:
            n_parts = int(numpy.ceil(n_token / max_tokens))
            final_data.extend(
                (part, str(page_idx) + "_" + str(nth_part))
                for nth_part, part in enumerate(_split_into_parts(content, n_parts))
            )
        else:
            final_data.append((content, str(page_idx)))

    return final_data