# Investor-API / utils / PdfUtils.py
# Last commit: c926830 — "id error resolved" (ashishbangwal)
import numpy
from PIL import Image
from typing import List, Tuple
import pymupdf
from .ChartClassifier import Classifier
from .HelperFunctions import CountTokens
from .ModelCallingFunctions import image_data_extractor
def extract_image_content(pixmap_list: List[pymupdf.Pixmap], text: str) -> List[str]:
    """Extract textual information from graph-like images.

    Converts each pixmap to a PIL image, runs the ONNX chart classifier over
    the batch, and sends only the images classified as graphs to the model
    for data extraction.

    Args:
        pixmap_list: Rendered page images. Assumed to carry RGB samples —
            TODO confirm upstream always produces RGB pixmaps; non-decodable
            ones are skipped with a logged message.
        text: Page text passed to the extractor as context for each image.

    Returns:
        One extracted-content string per image classified as a graph.
    """
    # Start a fresh classifier inference session for this call.
    classifier = Classifier("utils/graph_classifierV2_B.onnx")
    img_list = []
    for pixmap in pixmap_list:
        try:
            img_list.append(
                Image.frombytes(
                    mode="RGB", size=(pixmap.width, pixmap.height), data=pixmap.samples
                )
            )
        except ValueError as e:
            # frombytes raises ValueError when the sample buffer does not
            # match an RGB image of the advertised size (e.g. alpha/CMYK).
            print(f"Skipping pixmap that could not be decoded as RGB: {e}")
    graph_flags = classifier.classify(img_list)
    print(graph_flags)
    response_list = []
    for idx, is_graph in enumerate(graph_flags):
        if is_graph:
            response = image_data_extractor(img=img_list[idx], text=text)
            response_list.append(str(response))
    return response_list
def _split_into_parts(content: str, n_parts: int, page_idx: int) -> List[Tuple[str, str]]:
    """Split *content* into *n_parts* roughly equal chunks labelled '<page>_<part>'.

    The last chunk absorbs any division remainder so no characters are lost.
    """
    part_size = len(content) // n_parts
    parts = []
    for nth_part in range(n_parts):
        start = nth_part * part_size
        # Fix: the last slice must run to the end of the string; using
        # start + part_size everywhere drops up to n_parts-1 trailing chars.
        end = len(content) if nth_part == n_parts - 1 else start + part_size
        parts.append((content[start:end], f"{page_idx}_{nth_part}"))
    return parts


def ProcessPdf(pdf_content: bytes) -> List[Tuple[str, str]]:
    """
    Takes a PDF (bytes) and returns a list of (text, page_id) tuples, where
    text includes both the page's textual content and any extracted image
    content, and page_id is the 0-based page number as a string (suffixed
    with "_<part>" when a long page is split into ~500-token chunks).

    Note: the original annotation declared Tuple[str, int], but page ids
    have always been emitted as strings; the annotation now matches reality.
    """
    print("Extract content called ")
    pdf_doc = pymupdf.open(stream=pdf_content, filetype="pdf")
    pages_content: List[str] = []
    referenced_xrefs: List[int] = []  # image xrefs already rendered, to skip duplicates
    try:
        for page_number in range(pdf_doc.page_count):
            # Extract text content; flatten newlines into tabs so each page
            # reads as one logical line.
            page = pdf_doc.load_page(page_number)
            text_content = str(page.get_text()).replace("\n", "\t")
            page_content = text_content
            # Collect pixmaps for images not yet seen on earlier pages.
            pixmap_list = []
            for img_info in page.get_image_info(xrefs=True):
                xref = img_info["xref"]
                if xref in referenced_xrefs:
                    continue
                try:
                    pixmap_list.append(pymupdf.Pixmap(pdf_doc, xref))
                    referenced_xrefs.append(xref)
                except ValueError as e:
                    print(f"Skipping image with xref {xref} due to error: {e}")
            if pixmap_list:
                # text_content is already newline-free; no second replace needed.
                img_content = extract_image_content(
                    pixmap_list=pixmap_list, text=text_content
                )
                page_content = page_content + "\n\n" + "\n\n".join(img_content)
            pages_content.append(page_content)
    finally:
        # Close the document even if extraction fails part-way through.
        pdf_doc.close()

    num_tokens = CountTokens(pages_content)
    final_data: List[Tuple[str, str]] = []
    # Split any page whose content exceeds ~500 tokens into multiple parts,
    # since downstream embedding models cap out around 512 tokens.
    for idx, n_token in enumerate(num_tokens):
        if n_token > 500:
            n_parts = int(numpy.ceil(n_token / 500))
            final_data += _split_into_parts(pages_content[idx], n_parts, idx)
        else:
            final_data.append((pages_content[idx], str(idx)))
    return final_data