import numpy
from PIL import Image
from typing import List, Tuple

import pymupdf

from .ChartClassifier import Classifier
from .HelperFunctions import CountTokens
from .ModelCallingFunctions import image_data_extractor


def extract_image_content(pixmap_list: List[pymupdf.Pixmap], text: str) -> List[str]:
    """Classify each pixmap and extract textual content from the graph-like ones.

    Args:
        pixmap_list: Rendered page images to inspect.
        text: Page text passed to the extractor model as context.

    Returns:
        One extracted-content string per image classified as a graph/chart.
    """
    # Start the ONNX classifier inference session (loaded per call).
    classifier = Classifier("utils/graph_classifierV2_B.onnx")
    img_list = []
    for pixmap in pixmap_list:
        try:
            # NOTE(review): assumes every pixmap is 3-channel RGB; CMYK or
            # alpha pixmaps would make frombytes fail — TODO confirm upstream.
            img_list.append(
                Image.frombytes(
                    mode="RGB",
                    size=(pixmap.width, pixmap.height),
                    data=pixmap.samples,
                )
            )
        except Exception as e:
            # Best effort: a single undecodable image must not abort the page.
            print(e)
    graph_image = classifier.classify(img_list)
    print(graph_image)
    response_list = []
    for idx, is_graph in enumerate(graph_image):
        if is_graph:
            response = image_data_extractor(img=img_list[idx], text=text)
            response_list.append(str(response))
    return response_list


def ProcessPdf(pdf_content: bytes) -> List[Tuple[str, int]]:
    """
    Takes PDF(bytes) and return a list of tuples containing text(including
    textual and image content) and page number containing that text.
    """
    print("Extract content called ")
    pages_content = []
    refered_xref = []  # xrefs already extracted, so repeated images are skipped
    # Context manager guarantees the document is closed even on error paths.
    with pymupdf.open(stream=pdf_content, filetype="pdf") as pdf_doc:
        for page_number in range(pdf_doc.page_count):
            page_content = ""
            # Extract text content; tabs instead of newlines keep one page on one line.
            page = pdf_doc.load_page(page_number)
            text_content = str(page.get_text()).replace("\n", "\t")
            page_content += text_content
            # Extract image content (each xref only once across the document).
            image_list = page.get_image_info(xrefs=True)
            pixmap_list = []
            for img_info in image_list:
                xref = img_info["xref"]
                if xref not in refered_xref:
                    try:
                        img_pixmap = pymupdf.Pixmap(pdf_doc, xref)
                        pixmap_list.append(img_pixmap)
                        refered_xref.append(xref)
                    except ValueError as e:
                        print(f"Skipping image due to error: {e}")
            if pixmap_list:
                img_content = extract_image_content(
                    pixmap_list=pixmap_list, text=text_content
                )
                page_content = page_content + "\n\n" + "\n\n".join(img_content)
            pages_content.append(page_content)
    num_tokens = CountTokens(pages_content)
    final_data = []
    # Split any page whose content exceeds the token threshold (500) into
    # roughly equal character chunks labelled "<page>_<part>".
    for e, n_token in enumerate(num_tokens):
        if n_token > 500:
            n_parts = int(numpy.ceil(n_token / 500))
            len_content = len(pages_content[e])
            part_size = len_content // n_parts
            for nth_part in range(n_parts):
                start = nth_part * part_size
                # The last part absorbs the integer-division remainder so no
                # trailing text is silently dropped.
                end = len_content if nth_part == n_parts - 1 else start + part_size
                final_data.append((pages_content[e][start:end], f"{e}_{nth_part}"))
        else:
            final_data.append((pages_content[e], str(e)))
    return final_data