import base64
import os
from io import BytesIO
from typing import Any, Dict, List, Optional, Tuple

import fitz
from google.api_core.client_options import ClientOptions
from google.cloud import documentai
from PIL import Image, ImageChops
| |
|
class GoogleDocumentAPI:
    """Convenience wrapper around a single Google Document AI processor.

    The class sends a local file to the configured processor and offers
    helpers that turn the returned ``documentai.Document`` into plain Python
    structures (paragraph dictionaries, a markdown table, line heights).

    Document AI types in annotations are written as strings so they are not
    evaluated when the class body is executed.
    """

    # Conversion factor used to turn typographic points into millimetres.
    _PT_TO_MM = 0.3528

    def __init__(
        self,
        credentials_path: str,
        project_id: str = "649829115993",
        location: str = "us",
        processor_id: str = "7f9fd758484d83fe",
        mime_type: str = "application/pdf",
    ):
        """Store the processor coordinates and publish the credentials path.

        Args:
            credentials_path: Path written to the
                ``GOOGLE_APPLICATION_CREDENTIALS`` environment variable.
                Note that this changes process-wide state.
            project_id: Google Cloud project that owns the processor.
            location: Processor region; also used to build the API endpoint.
            processor_id: Identifier of the processor to call.
            mime_type: MIME type declared for every uploaded file.
        """
        os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = credentials_path

        self.project_id = project_id
        self.location = location
        self.processor_id = processor_id
        self.mime_type = mime_type

    def process_document(
        self,
        file_path: str,
        field_mask: Optional[str] = None,
        processor_version_id: Optional[str] = None,
        pages: Optional[List[int]] = None,
    ) -> "documentai.Document":
        """Send a local file to the processor and return the parsed document.

        Args:
            file_path: Path of the file to upload; it is read in binary mode.
            field_mask: Optional field mask forwarded to the request.
            processor_version_id: When given, that processor version is
                addressed instead of the processor itself.
            pages: Page numbers passed to the individual page selector.
                Defaults to ``[1]`` (the previous hard-coded behaviour).

        Returns:
            The ``document`` attribute of the service response.
        """
        opts = ClientOptions(api_endpoint=f"{self.location}-documentai.googleapis.com")
        client = documentai.DocumentProcessorServiceClient(client_options=opts)

        if processor_version_id:
            name = client.processor_version_path(
                self.project_id, self.location, self.processor_id, processor_version_id
            )
        else:
            name = client.processor_path(self.project_id, self.location, self.processor_id)

        with open(file_path, "rb") as source_file:
            content = source_file.read()

        raw_document = documentai.RawDocument(content=content, mime_type=self.mime_type)

        process_options = documentai.ProcessOptions(
            individual_page_selector=documentai.ProcessOptions.IndividualPageSelector(
                pages=list(pages) if pages else [1]
            )
        )

        request = documentai.ProcessRequest(
            name=name,
            raw_document=raw_document,
            field_mask=field_mask,
            process_options=process_options,
        )

        result = client.process_document(request=request)
        return result.document

    def get_document_text(self, document: "documentai.Document", page_number: int = 0) -> str:
        """Return the text of one page.

        Pages do not carry their own text; the page layout's text anchor is
        resolved against the document-level text instead.

        Args:
            document: The processed document.
            page_number: Zero-based index into ``document.pages``.
        """
        page = document.pages[page_number]
        return self._get_text(page.layout.text_anchor, document.text)

    @staticmethod
    def _describe_style(style) -> str:
        """Build a human-readable description of one text style entry.

        Returns an empty string when the style carries nothing worth
        reporting (no positive font size, "normal" weight/style, no family).
        """
        parts = []
        font_size = style.font_size
        if font_size and font_size.size > 0:
            unit = font_size.unit if font_size.unit else "pt"
            parts.append(f"font size: {round(font_size.size)}{unit}")
        if style.font_weight and style.font_weight.lower() != "normal":
            parts.append(f"font weight: {style.font_weight}")
        if style.text_style and style.text_style.lower() != "normal":
            parts.append(f"text style: {style.text_style}")
        if style.font_family:
            parts.append(f"font family: {style.font_family}")
        return " ".join(parts)

    @staticmethod
    def _get_style_info(
        text_anchor: "documentai.Document.TextAnchor",
        document: "documentai.Document",
    ) -> str:
        """Describe the styles whose text ranges overlap ``text_anchor``.

        Returns:
            ``"N/A"`` when the document has no style information at all,
            ``"default"`` when no describable style overlaps the anchor,
            otherwise the distinct style descriptions joined by ``", "`` in
            order of first appearance.
        """
        if not getattr(document, "text_styles", None):
            return "N/A"

        descriptions = []
        for anchor_segment in text_anchor.text_segments:
            anchor_start = int(anchor_segment.start_index)
            anchor_end = int(anchor_segment.end_index)

            for style in document.text_styles:
                # Two half-open ranges overlap when the later start lies
                # strictly before the earlier end.
                overlaps = any(
                    max(anchor_start, int(style_segment.start_index))
                    < min(anchor_end, int(style_segment.end_index))
                    for style_segment in style.text_anchor.text_segments
                )
                if not overlaps:
                    continue
                description = GoogleDocumentAPI._describe_style(style)
                if description:
                    descriptions.append(description)

        if not descriptions:
            return "default"
        # dict.fromkeys removes duplicates while keeping insertion order.
        return ", ".join(dict.fromkeys(descriptions))

    @staticmethod
    def _get_text(text_anchor: "documentai.Document.TextAnchor", text: str) -> str:
        """Concatenate every segment of ``text_anchor`` sliced out of ``text``."""
        if not text_anchor.text_segments:
            return ""
        return "".join(
            text[int(segment.start_index):int(segment.end_index)]
            for segment in text_anchor.text_segments
        )

    @classmethod
    def _height_mm(cls, y_coords: List[float], page_height: float) -> float:
        """Convert a normalized vertical extent into millimetres.

        Returns ``0.0`` when no coordinates are available.
        """
        if not y_coords:
            return 0.0
        height_ratio = max(y_coords) - min(y_coords)
        # NOTE(review): this treats page.dimension.height as points; confirm
        # against page.dimension.unit for the processor in use.
        height_pt = height_ratio * page_height
        return round(height_pt * cls._PT_TO_MM, 2)

    def extract_text_with_bounding_boxes(
        self, document: "documentai.Document"
    ) -> List[Dict[str, Any]]:
        """Extract text, bounding box, style and height for every paragraph.

        Args:
            document: The processed documentai.Document object.

        Returns:
            A list of dictionaries, one per paragraph, containing:
            - 'page_number': The page's ``page_number`` value.
            - 'text': The stripped paragraph text.
            - 'bounding_box': A list of ``{"x", "y"}`` normalized vertices
              (empty when the paragraph has no vertices).
            - 'style': Style information for the text.
            - 'height': Height of the block in millimetres, rounded to two
              decimals (``0.0`` when the paragraph has no vertices).
        """
        all_paragraphs = []
        full_text = document.text

        for page in document.pages:
            page_height = page.dimension.height

            for paragraph in page.paragraphs:
                layout = paragraph.layout
                normalized = layout.bounding_poly.normalized_vertices

                all_paragraphs.append({
                    "page_number": page.page_number,
                    "text": self._get_text(layout.text_anchor, full_text).strip(),
                    "bounding_box": [{"x": v.x, "y": v.y} for v in normalized],
                    "style": self._get_style_info(layout.text_anchor, document),
                    "height": self._height_mm([v.y for v in normalized], page_height),
                })
        return all_paragraphs

    def extract_text_with_markdown_table(self, document: "documentai.Document") -> str:
        """Render every paragraph of ``document`` as a markdown table."""
        data = self.extract_text_with_bounding_boxes(document)
        return self._create_markdown_table(data)

    def _quantize_coord(self, val: float, grid_size: int = 1000) -> int:
        """Converts a float (0-1) to an integer on a grid (truncating)."""
        return int(val * grid_size)

    def _create_markdown_table(self, data: List[Dict[str, Any]]) -> str:
        """Format paragraph dictionaries as a markdown table.

        Each row shows a 1-based text id, the quantized position of the first
        bounding-box vertex, the height, the style and the text. Newlines in
        the text become spaces and pipes are backslash-escaped so a cell
        cannot break the table. Rows whose bounding box is empty show
        ``N/A`` for both coordinates.
        """
        rows = [
            "| Text ID | X | Y | Text Height (mm) | Style | Text |\n",
            "|----|-----|-----|--------|-------|-------------------------------------------------------------------------|\n",
        ]
        for index, item in enumerate(data, start=1):
            bounding_box = item["bounding_box"]
            if bounding_box:
                first_vertex = bounding_box[0]
                x = self._quantize_coord(first_vertex["x"])
                y = self._quantize_coord(first_vertex["y"])
            else:
                x = y = "N/A"
            height = round(item.get("height", 0), 2)
            style = item.get("style", "N/A")
            text = item["text"].replace("\n", " ").replace("|", "\\|").strip()
            rows.append(f"| {index} | {x} | {y} | {height} | {style} | {text} |\n")
        return "".join(rows)

    def get_bounding_boxes(
        self, document: "documentai.Document", page_number: int = 0
    ) -> "List[documentai.BoundingPoly]":
        """
        Extracts bounding boxes for tokens on a specific page.

        Args:
            document: The processed document.
            page_number: Zero-based index into ``document.pages``.
        """
        page = document.pages[page_number]
        return [token.layout.bounding_poly for token in page.tokens]

    def extract_text_heights_mm(
        self, document: "documentai.Document"
    ) -> List[Tuple[int, str, float]]:
        """
        Extracts the height of each line of text from a Google Document AI
        parsed document and returns a list of heights in millimeters (mm).

        Lines without normalized vertices are skipped. The line text is built
        from all of its text segments.

        Parameters:
            document (google.cloud.documentai.Document): Parsed Document AI response object

        Returns:
            List of tuples: [(page_num, line_text, height_mm), ...] where
            page_num counts pages from 1 in document order.
        """
        heights = []

        for page_num, page in enumerate(document.pages, start=1):
            page_height = page.dimension.height

            for line in page.lines:
                layout = line.layout
                y_coords = [v.y for v in layout.bounding_poly.normalized_vertices]
                if not y_coords:
                    continue

                line_text = self._get_text(layout.text_anchor, document.text).strip()
                heights.append((page_num, line_text, self._height_mm(y_coords, page_height)))

        return heights
| |
|
| |
|
| | |