"""PDF processing: extract per-page text and chart-image content."""
from typing import List, Tuple

import numpy
import pymupdf
from PIL import Image

from .ChartClassifier import Classifier
from .HelperFunctions import CountTokens
from .ModelCallingFunctions import image_data_extractor
def extract_image_content(pixmap_list: List[pymupdf.Pixmap], text: str) -> List[str]:
    """Extract textual descriptions from chart/graph images of a PDF page.

    Converts each pixmap to a PIL image, classifies which images are
    graphs/charts, and runs the model-based data extractor on those only.

    Args:
        pixmap_list: Rendered images taken from a PDF page.
        text: Surrounding page text, passed to the extractor for context.

    Returns:
        One extracted-content string per image classified as a graph.
    """
    # Start classifier inference session (ONNX model flags chart/graph images).
    classifier = Classifier("utils/graph_classifierV2_B.onnx")

    img_list = []
    for pixmap in pixmap_list:
        try:
            # NOTE(review): assumes the pixmap is 3-channel RGB — confirm
            # upstream rendering; RGBA/CMYK pixmaps would fail this call.
            img_list.append(
                Image.frombytes(
                    mode="RGB", size=(pixmap.width, pixmap.height), data=pixmap.samples
                )
            )
        except Exception as e:
            # Best-effort: skip images that cannot be converted.
            print(e)

    graph_flags = classifier.classify(img_list)
    print(graph_flags)

    # Pair each image directly with its classification result; extract
    # data only from the images flagged as graphs.
    response_list = []
    for img, is_graph in zip(img_list, graph_flags):
        if is_graph:
            response = image_data_extractor(img=img, text=text)
            response_list.append(str(response))
    return response_list
def ProcessPdf(pdf_content: bytes) -> List[Tuple[str, str]]:
    """Extract per-page content (text plus chart-image descriptions) from a PDF.

    Pages whose content exceeds ~500 tokens are split into roughly equal
    character chunks so each piece stays under the downstream token limit.

    Args:
        pdf_content: Raw PDF file bytes.

    Returns:
        List of (content, page_id) tuples. page_id is the 0-based page
        number as a string, suffixed with "_<part>" when a page was split.
        (The original annotation said ``int`` but the ids are strings.)
    """
    print("Extract content called ")
    pdf_doc = pymupdf.open(stream=pdf_content, filetype="pdf")
    try:
        pages_content = []
        # Track xrefs already processed so an image embedded on several
        # pages is only extracted once.
        seen_xrefs = []
        for page_number in range(pdf_doc.page_count):
            page = pdf_doc.load_page(page_number)
            # Flatten newlines so page text stays on one logical line.
            text_content = str(page.get_text()).replace("\n", "\t")
            page_content = text_content

            # Collect pixmaps for images not already seen on earlier pages.
            pixmap_list = []
            for img_info in page.get_image_info(xrefs=True):
                xref = img_info["xref"]
                if xref in seen_xrefs:
                    continue
                try:
                    pixmap_list.append(pymupdf.Pixmap(pdf_doc, xref))
                    seen_xrefs.append(xref)
                except ValueError as err:
                    print(f"Skipping image xref {xref} due to error: {err}")

            if pixmap_list:
                img_content = extract_image_content(
                    pixmap_list=pixmap_list, text=text_content
                )
                page_content = page_content + "\n\n" + "\n\n".join(img_content)
            pages_content.append(page_content)

        num_tokens = CountTokens(pages_content)
        final_data = []
        # Split any page whose content exceeds ~500 tokens into equal parts.
        for page_idx, n_token in enumerate(num_tokens):
            content = pages_content[page_idx]
            if n_token > 500:
                n_parts = int(numpy.ceil(n_token / 500))
                part_size = len(content) // n_parts
                for nth_part in range(n_parts):
                    start = nth_part * part_size
                    # Last part absorbs the remainder so no trailing
                    # characters are lost (floor division previously
                    # dropped up to n_parts-1 chars from the page tail).
                    end = (
                        len(content)
                        if nth_part == n_parts - 1
                        else start + part_size
                    )
                    final_data.append((content[start:end], f"{page_idx}_{nth_part}"))
            else:
                final_data.append((content, str(page_idx)))
        return final_data
    finally:
        # Close the document even if extraction raises part-way through.
        pdf_doc.close()