Spaces:
Build error
Build error
| # Copyright (c) 2022, Lawrence Livermore National Security, LLC. | |
| # All rights reserved. | |
| # See the top-level LICENSE and NOTICE files for details. | |
| # LLNL-CODE-838964 | |
| # SPDX-License-Identifier: Apache-2.0-with-LLVM-exception | |
| from pdfminer.pdfpage import PDFParser | |
| from pdfminer.pdfpage import PDFDocument | |
| from pdfminer.pdfpage import PDFPage | |
| from pdfminer.layout import LTTextBoxHorizontal | |
| from pdfminer.layout import LTTextLineHorizontal | |
| from pdfminer.layout import LTChar | |
| from pdfminer.layout import LAParams | |
| from pdfminer.layout import LTRect | |
| from pdfminer.layout import LTFigure | |
| from pdfminer.converter import PDFPageAggregator | |
| from pdfminer.pdfinterp import PDFResourceManager | |
| from pdfminer.pdfinterp import PDFPageInterpreter | |
| from pdfminer import pdfinterp | |
| from collections.abc import Iterable | |
| from collections import Counter | |
| from collections import OrderedDict | |
| import os | |
| # This is used for highlighting in PDFs | |
| from PyPDF2.generic import ( | |
| DictionaryObject, | |
| NumberObject, | |
| FloatObject, | |
| NameObject, | |
| TextStringObject, | |
| ArrayObject | |
| ) | |
| # Used to extract pages | |
| from PyPDF2 import PdfFileReader, PdfFileWriter | |
def get_page_sizes(document):
    """
    Collect the media box of every page in a PDF file.

    :param document: string path of the PDF file to read
    :returns: list with one entry per page; each entry is the page's media
              box, i.e. the page size as x0 y0 x1 y1
    """
    # Hold the file open only while PDFMiner walks the pages; previously the
    # handle was opened inline and never closed.
    with open(document, 'rb') as pdf_file:
        doc = PDFDocument(PDFParser(pdf_file))
        return [page.mediabox for page in PDFPage.create_pages(doc)]
def get_page_count(document):
    """
    Read the page count stored in a PDF's page tree.

    :param document: PDF file object opened in binary mode
    :returns: the 'Count' entry of the catalog's resolved 'Pages' node
    """
    # TODO: Is there a better way of getting the page count than this?
    catalog = PDFDocument(PDFParser(document)).catalog
    pages_root = pdfinterp.resolve1(catalog['Pages'])
    return pages_root['Count']
def get_pdf_page_count(filename):
    """
    Return the number of pages of the PDF stored at `filename`.

    :param filename: string path of the PDF file
    :returns: page count as reported by get_page_count
    """
    with open(filename, 'rb') as pdf_file:
        page_total = get_page_count(pdf_file)
    return page_total
def get_pages(document, page_numbers = None):
    """
    Yield the PDFMiner layout of each requested page.

    :param document: PDF file object opened in binary mode
    :param page_numbers: optional iterable of zero-based page numbers; when
                         None every page is processed
    :returns: generator of (layout, page_number) tuples in document order
    """
    # Create resource manager
    rsrcmgr = PDFResourceManager()
    # Set parameters for analysis.
    laparams = LAParams()
    # Create a PDF page aggregator object.
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    page_count = get_page_count(document)
    if page_numbers is None:
        wanted = range(page_count)
    else:
        # PDFPage.get_pages only uses the numbers as a membership filter and
        # hands pages back in document order.  Zipping that against an
        # unsorted, duplicated or out-of-range request would attach the
        # wrong number to a layout, so normalise the request first.
        wanted = sorted({n for n in page_numbers if 0 <= n < page_count})
    if not wanted:
        # An empty filter would make PDFMiner walk every page for nothing.
        return

    for page, page_number in zip(PDFPage.get_pages(document, wanted), wanted):
        interpreter.process_page(page)
        # receive the LTPage object for the page.
        layout = device.get_result()
        yield layout, page_number
def partial_overlaps(box, other):
    """
    Determine if `other` straddles an edge of `box` while the two overlap.

    True only when, on the x or the y axis, `other` strictly spans one of
    the edge coordinates of `box`, and overlaps(box, other) also holds.

    TODO: Really should just use a standard Python library for this.

    box -- 2 coordinate bounding box (x1,y1,x2,y2)
    other -- 2 coordinate bounding box (x1,y1,x2,y2)
    """
    def spans(low, high, edge):
        # low      edge      high
        #  <------------------>
        return low < edge < high

    crosses_x = (spans(other[0], other[2], box[0])
                 or spans(other[0], other[2], box[2]))
    crosses_y = (spans(other[1], other[3], box[1])
                 or spans(other[1], other[3], box[3]))
    if not (crosses_x or crosses_y):
        return False
    # TODO: Simplify?
    return overlaps(box, other)
def overlaps(box, other):
    """
    Determine if the two bounding boxes overlap each other.

    Boxes that merely touch along an edge or corner count as overlapping.

    TODO: Really should just use a standard Python library for this.

    box -- 2 coordinate bounding box (x1,y1,x2,y2)
    other -- 2 coordinate bounding box (x1,y1,x2,y2)
    """
    # Separated horizontally: box lies entirely right or left of other.
    if box[0] > other[2] or box[2] < other[0]:
        return False
    # Separated vertically: box lies entirely above or below other.
    if box[1] > other[3] or box[3] < other[1]:
        return False
    return True
def union(src, other):
    """
    Compute the smallest bounding box enclosing both src and other.

    Only the first four entries of each argument are read, so longer
    sequences are accepted.

    src -- 2 coordinate bounding box (x1,y1,x2,y2)
    other -- 2 coordinate bounding box (x1,y1,x2,y2)
    returns union of src and other as a list [xmin, ymin, xmax, ymax]
    """
    lower_corner = [min(src[i], other[i]) for i in (0, 1)]
    upper_corner = [max(src[i], other[i]) for i in (2, 3)]
    return lower_corner + upper_corner
| # See: https://gist.github.com/agentcooper/4c55133f5d95866acdee5017cd318558#file-pypdf2highlight-py | |
| # x1, y1 starts in bottom left corner | |
def createHighlight(x1, y1, x2, y2, meta, color = [1, 0, 0]):
    """
    Build a PDF highlight annotation dictionary for a rectangular area.

    x1, y1 -- first corner of the area (bottom left)
    x2, y2 -- opposite corner of the area
    meta -- mapping with the keys "author" and "contents"
    color -- three color components stored under /C; the default list is
             only read, never modified
    returns a DictionaryObject describing the annotation
    """
    def float_array(values):
        # Wrap plain numbers as a PDF array of float objects.
        return ArrayObject([FloatObject(value) for value in values])

    annotation = DictionaryObject()
    annotation.update({
        NameObject("/F"): NumberObject(4),
        NameObject("/Type"): NameObject("/Annot"),
        NameObject("/Subtype"): NameObject("/Highlight"),
        NameObject("/T"): TextStringObject(meta["author"]),
        NameObject("/Contents"): TextStringObject(meta["contents"]),
        NameObject("/C"): float_array(color),
        NameObject("/Rect"): float_array((x1, y1, x2, y2)),
        # Corner order: (x1,y2), (x2,y2), (x1,y1), (x2,y1)
        NameObject("/QuadPoints"): float_array(
            (x1, y2, x2, y2, x1, y1, x2, y1)),
    })
    return annotation
def addHighlightToPage(highlight, page, output):
    """
    Register a highlight with the writer and attach it to a page.

    highlight -- annotation object, e.g. as built by createHighlight
    page -- page object whose "/Annots" entry is extended or created
    output -- writer object providing _addObject
    """
    reference = output._addObject(highlight)
    annots_key = NameObject("/Annots")
    if "/Annots" not in page:
        # First annotation on this page: start a fresh array.
        page[annots_key] = ArrayObject([reference])
        return
    page[annots_key].append(reference)
def get_pdf_words(document, page_numbers=None):
    """
    Get all LTChar or LTTextLineHorizontal objects from the document.

    :param document: string path of the PDF file to process
    :param page_numbers: optional page numbers forwarded to get_pages;
                         None processes every page
    :returns: A map of page #'s containing lists of coordinates and PDFMiner
        objects. Ex.: {page_number: [[x1, y1, x2, y2, <LTTextLineHorizontal>],]}
    """
    bboxes = {}
    # get_pages is a generator, so it must be consumed while the file is
    # still open; previously the handle was never closed at all.
    with open(document, 'rb') as pdf_doc:
        for layout, page in get_pages(pdf_doc, page_numbers):
            page_items = bboxes[page] = []
            for element in layout:
                if not isinstance(element, Iterable):
                    continue  # not iterable
                for sub_element in element:
                    # Characters and horizontal text lines are recorded the
                    # same way: bbox coordinates followed by the object.
                    # TODO: Handle word deliminator (an LTChar whose text
                    # is ' ')
                    if isinstance(sub_element,
                                  (LTChar, LTTextLineHorizontal)):
                        page_items.append(
                            list(sub_element.bbox) + [sub_element])
    return bboxes
def get_paragraphs(words):
    """
    Group the text lines of each page into paragraphs.

    A paragraph is closed when a line's integer height differs from the
    page's most common height by more than one unit, when a line starts more
    than 1.5 line heights below the previous one, or when a line is indented
    relative to the previous line.

    NOTE(review): the nesting of a few statements was reconstructed from a
    flattened copy of this function -- confirm against the canonical source.

    :param words: map of page number to lists shaped like
                  [x1, y1, x2, y2, <PDFMiner object>], as built by
                  get_pdf_words
    :returns: tuple (paragraphs, indexes); paragraphs is a list of strings
              and indexes holds the running total of their lengths
    """
    paragraph_tolerance = 0.1
    max_height_diff = 1
    paragraphs = []
    for page, elements in words.items():
        # A page without extracted elements has no nominal font; indexing
        # the empty most_common() result used to raise IndexError here.
        if not elements:
            continue

        # Find nominal font size (heights rounded down to int)
        freq = Counter(int(element[3] - element[1]) for element in elements)
        nominal_font = freq.most_common(1)[0][0]
        print("Nominal font is:", nominal_font)
        print("Page:", page)

        x_offset_prev_line = None
        prev_y_offset = None
        paragraph_content = ""
        first_line = False
        processed_first_line = False
        for element in elements:
            x_offset = element[0]
            y_offset = element[1]
            height = int(element[3] - element[1])
            text = element[4].get_text()

            # Font size mismatch: close the running paragraph, skip the line
            if abs(height - nominal_font) > max_height_diff:
                if paragraph_content:
                    print("Content append:", len(paragraph_content))
                    paragraphs.append(paragraph_content)
                    paragraph_content = ""
                print("Continue due to height != nominal_font")
                continue

            print("ELEMENT:", element[0:4], text[0:15])

            # Large vertical gap: this line starts a new paragraph
            if (prev_y_offset is not None and paragraph_content
                    and y_offset < prev_y_offset - height * 1.5):
                print("Content append:", len(paragraph_content))
                paragraphs.append(paragraph_content)
                paragraph_content = text
                prev_y_offset = None
                continue
            prev_y_offset = y_offset

            if not isinstance(element[4], LTTextLineHorizontal):
                continue

            # First line after a reset: remember its left edge
            if x_offset_prev_line is None:
                x_offset_prev_line = x_offset
                if not processed_first_line:
                    first_line = True
                    processed_first_line = True
                if height == nominal_font:
                    paragraph_content += text
                continue

            # Check case if first line was indented
            if x_offset_prev_line > x_offset and first_line:
                first_line = False
                paragraph_content += text
                x_offset_prev_line = x_offset
                continue

            # Is this indented?  Ignore small changes.  Equivalent to
            # "previous < current and abs(previous - current) > tolerance"
            # because the tolerance is positive.
            if x_offset - x_offset_prev_line > paragraph_tolerance:
                if height == nominal_font and paragraph_content:
                    paragraphs.append(paragraph_content)
                # NOTE(review): assumed to run regardless of the check
                # above, mirroring the vertical-gap branch -- confirm.
                paragraph_content = text
                # Reset at next line read
                # What if next paragraph is also indented???
                x_offset_prev_line = None
                continue

            if height == nominal_font:
                paragraph_content += text

        # TODO: Remove redundant space
        # NOTE(review): flush assumed to happen once per page -- confirm.
        if paragraph_content != "":
            paragraphs.append(paragraph_content)

    # Find paragraph indexes (cumulative character counts)
    running_total = 0
    indexes = []
    for paragraph in paragraphs:
        running_total += len(paragraph)
        indexes.append(running_total)
    return paragraphs, indexes
def get_pdf_elements(document, element_type, page_numbers=None):
    """
    Collect the bounding boxes of all top-level layout elements of a type.

    :param document: string path of the PDF file to process
    :param element_type: class (or tuple of classes) passed to isinstance
                         to select layout elements
    :param page_numbers: optional page numbers forwarded to get_pages;
                         None processes every page
    :returns: map of page number to a list of [x1, y1, x2, y2] entries; the
              element's non_stroking_color is appended as a fifth value
              when the element has that attribute
    """
    items = {}
    # get_pages is a generator, so it must be consumed while the file is
    # still open; previously the handle was never closed at all.
    with open(document, 'rb') as pdf_doc:
        for layout, page in get_pages(pdf_doc, page_numbers):
            page_items = items[page] = []
            for element in layout:
                if not isinstance(element, element_type):
                    continue
                item = list(element.bbox)
                if hasattr(element, 'non_stroking_color'):
                    item.append(element.non_stroking_color)
                page_items.append(item)
    # NOTE(review): debug output kept from the original; assumed to run
    # once after all pages were collected -- confirm.
    print(items)
    return items
def get_large_colored_background_rectangles(document, page_numbers=None):
    """
    Find the LTRect elements that exceed a minimum width and height.

    :param document: string path of the PDF file to process
    :param page_numbers: optional page numbers forwarded to
                         get_pdf_elements
    :returns: map of page number to the list of qualifying rectangles;
              pages without a qualifying rectangle are absent from the map
    """
    # Only include rectangles that are at least 4" x 1" in size
    # (presumably expressed in points, 72 per inch -- confirm)
    min_width, min_height = 288.0, 72.0

    rects_out = {}
    found = get_pdf_elements(document, LTRect, page_numbers)
    for page, rects in found.items():
        print("Rects:", rects)
        for rect in rects:
            width = rect[2] - rect[0]
            height = rect[3] - rect[1]
            print("Dimensions:", width, height)
            if not (width > min_width and height > min_height):
                continue
            rects_out.setdefault(page, []).append(rect)
    return rects_out
def extract_pages(document, output, page_numbers=None):
    """
    Copy selected pages of a PDF into a new PDF file.

    :param document: source PDF, passed straight to PdfFileReader
    :param output: string path of the PDF file to write
    :param page_numbers: iterable of zero-based page numbers to copy, in
                         the order they should appear; None copies every
                         page
    """
    pdf = PdfFileReader(document)
    if page_numbers is None:
        # The declared default used to fail with "'NoneType' object is not
        # iterable"; treat it as "all pages" instead.
        page_numbers = range(pdf.getNumPages())

    pdf_writer = PdfFileWriter()
    for page in page_numbers:
        pdf_writer.addPage(pdf.getPage(page))

    with open(output, "wb") as out:
        pdf_writer.write(out)