| | import base64 |
| | import re |
| | from tempfile import TemporaryDirectory |
| | from math import atan, cos, sin |
| | from typing import Dict, Optional, Tuple |
| | from xml.etree import ElementTree as ET |
| | from xml.etree.ElementTree import Element |
| |
|
| | import numpy as np |
| | import PyPDF2 |
| | from PyPDF2 import PdfFileMerger |
| | from doctr.io import DocumentFile |
| | from doctr.models import ocr_predictor |
| | from PIL import Image |
| | from reportlab.lib.colors import black |
| | from reportlab.lib.units import inch |
| | from reportlab.lib.utils import ImageReader |
| | from reportlab.pdfgen.canvas import Canvas |
| |
|
| |
|
| |
|
| |
|
| | class HocrParser(): |
| |
|
| | def __init__(self): |
| | self.box_pattern = re.compile(r'bbox((\s+\d+){4})') |
| | self.baseline_pattern = re.compile(r'baseline((\s+[\d\.\-]+){2})') |
| |
|
| | def _element_coordinates(self, element: Element) -> Dict: |
| | """ |
| | Returns a tuple containing the coordinates of the bounding box around |
| | an element |
| | """ |
| | out = out = {'x1': 0, 'y1': 0, 'x2': 0, 'y2': 0} |
| | if 'title' in element.attrib: |
| | matches = self.box_pattern.search(element.attrib['title']) |
| | if matches: |
| | coords = matches.group(1).split() |
| | out = {'x1': int(coords[0]), 'y1': int( |
| | coords[1]), 'x2': int(coords[2]), 'y2': int(coords[3])} |
| | return out |
| |
|
| | def _get_baseline(self, element: Element) -> Tuple[float, float]: |
| | """ |
| | Returns a tuple containing the baseline slope and intercept. |
| | """ |
| | if 'title' in element.attrib: |
| | matches = self.baseline_pattern.search( |
| | element.attrib['title']).group(1).split() |
| | if matches: |
| | return float(matches[0]), float(matches[1]) |
| | return (0.0, 0.0) |
| |
|
| | def _pt_from_pixel(self, pxl: Dict, dpi: int) -> Dict: |
| | """ |
| | Returns the quantity in PDF units (pt) given quantity in pixels |
| | """ |
| | pt = [(c / dpi * inch) for c in pxl.values()] |
| | return {'x1': pt[0], 'y1': pt[1], 'x2': pt[2], 'y2': pt[3]} |
| |
|
| | def _get_element_text(self, element: Element) -> str: |
| | """ |
| | Return the textual content of the element and its children |
| | """ |
| | text = '' |
| | if element.text is not None: |
| | text += element.text |
| | for child in element: |
| | text += self._get_element_text(child) |
| | if element.tail is not None: |
| | text += element.tail |
| | return text |
| |
|
| | def export_pdfa(self, |
| | out_filename: str, |
| | hocr: ET.ElementTree, |
| | image: Optional[np.ndarray] = None, |
| | fontname: str = "Times-Roman", |
| | fontsize: int = 12, |
| | invisible_text: bool = True, |
| | add_spaces: bool = True, |
| | dpi: int = 300): |
| | """ |
| | Generates a PDF/A document from a hOCR document. |
| | """ |
| |
|
| | width, height = None, None |
| | |
| | for div in hocr.findall(".//div[@class='ocr_page']"): |
| | coords = self._element_coordinates(div) |
| | pt_coords = self._pt_from_pixel(coords, dpi) |
| | width, height = pt_coords['x2'] - \ |
| | pt_coords['x1'], pt_coords['y2'] - pt_coords['y1'] |
| | |
| | break |
| | if width is None or height is None: |
| | raise ValueError("Could not determine page size") |
| |
|
| | pdf = Canvas(out_filename, pagesize=(width, height), pageCompression=1) |
| |
|
| | span_elements = [element for element in hocr.iterfind(".//span")] |
| | for line in span_elements: |
| | if 'class' in line.attrib and line.attrib['class'] == 'ocr_line' and line is not None: |
| | |
| | pxl_line_coords = self._element_coordinates(line) |
| | line_box = self._pt_from_pixel(pxl_line_coords, dpi) |
| |
|
| | |
| | slope, pxl_intercept = self._get_baseline(line) |
| | if abs(slope) < 0.005: |
| | slope = 0.0 |
| | angle = atan(slope) |
| | cos_a, sin_a = cos(angle), sin(angle) |
| | intercept = pxl_intercept / dpi * inch |
| | baseline_y2 = height - (line_box['y2'] + intercept) |
| |
|
| | |
| | text = pdf.beginText() |
| | text.setFont(fontname, fontsize) |
| | pdf.setFillColor(black) |
| | if invisible_text: |
| | text.setTextRenderMode(3) |
| |
|
| | |
| | text.setTextTransform( |
| | cos_a, -sin_a, sin_a, cos_a, line_box['x1'], baseline_y2) |
| |
|
| | elements = line.findall(".//span[@class='ocrx_word']") |
| | for elem in elements: |
| | elemtxt = self._get_element_text(elem).strip() |
| | |
| | elemtxt = elemtxt.translate(str.maketrans( |
| | {'ff': 'ff', 'ffi': 'ffi', 'ffl': 'ffl', 'fi': 'fi', 'fl': 'fl'})) |
| | if not elemtxt: |
| | continue |
| |
|
| | |
| | pxl_coords = self._element_coordinates(elem) |
| | box = self._pt_from_pixel(pxl_coords, dpi) |
| | if add_spaces: |
| | elemtxt += ' ' |
| | box_width = box['x2'] + pdf.stringWidth(elemtxt, fontname, fontsize) - box['x1'] |
| | else: |
| | box_width = box['x2'] - box['x1'] |
| | font_width = pdf.stringWidth(elemtxt, fontname, fontsize) |
| |
|
| | |
| | cursor = text.getStartOfLine() |
| | dx = box['x1'] - cursor[0] |
| | dy = baseline_y2 - cursor[1] |
| | text.moveCursor(dx, dy) |
| |
|
| | |
| | if font_width > 0: |
| | text.setHorizScale(100 * box_width / font_width) |
| | text.textOut(elemtxt) |
| | pdf.drawText(text) |
| |
|
| | |
| | if image is not None: |
| | pdf.drawImage(ImageReader(Image.fromarray(image)), |
| | 0, 0, width=width, height=height) |
| | pdf.save() |