import base64
import os
from io import BytesIO
from typing import Any, Dict, List, Optional

import fitz  # PyMuPDF
from google.api_core.client_options import ClientOptions
from google.cloud import documentai  # type: ignore
from PIL import Image, ImageChops


class GoogleDocumentAPI:
    """Wrapper around one Google Document AI processor.

    The project id, location and processor id are fixed in ``__init__``.
    The methods send a local file to that processor and turn the returned
    ``documentai.Document`` into text, bounding boxes, text heights and a
    Markdown table.
    """

    # Conversion factor from points to millimeters.
    _PT_TO_MM = 0.3528

    def __init__(self, credentials_path: str):
        """Configure credentials and the target processor.

        Args:
            credentials_path: Path stored in the
                ``GOOGLE_APPLICATION_CREDENTIALS`` environment variable.
                This changes the environment of the whole process, not just
                this instance.
        """
        os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = credentials_path
        self.project_id = "649829115993"
        self.location = "us"  # Format is "us" or "eu"
        self.processor_id = "7f9fd758484d83fe"  # Only use this
        # Refer to https://cloud.google.com/document-ai/docs/file-types
        # for supported file types.
        self.mime_type = "application/pdf"

    def process_document(
        self,
        file_path: str,
        field_mask: Optional[str] = None,
        processor_version_id: Optional[str] = None,
        pages: Optional[List[int]] = None,
    ) -> documentai.Document:
        """Send a local file to the processor and return the parsed document.

        Args:
            file_path: Path of the file to upload; it is read in binary mode
                and sent with ``self.mime_type``.
            field_mask: Optional field mask forwarded to the request.
            processor_version_id: When given, a specific processor version
                is addressed instead of the processor itself.
            pages: Page numbers to process. Defaults to ``[1]``, i.e. only
                the first page, when omitted or empty.

        Returns:
            The ``document`` attribute of the service response.

        Raises:
            OSError: If ``file_path`` cannot be opened. Errors raised by the
                Document AI client are not caught here.
        """
        opts = ClientOptions(
            api_endpoint=f"{self.location}-documentai.googleapis.com"
        )
        client = documentai.DocumentProcessorServiceClient(client_options=opts)

        if processor_version_id:
            name = client.processor_version_path(
                self.project_id,
                self.location,
                self.processor_id,
                processor_version_id,
            )
        else:
            name = client.processor_path(
                self.project_id, self.location, self.processor_id
            )

        with open(file_path, "rb") as source_file:
            content = source_file.read()

        raw_document = documentai.RawDocument(
            content=content, mime_type=self.mime_type
        )
        process_options = documentai.ProcessOptions(
            individual_page_selector=documentai.ProcessOptions.IndividualPageSelector(
                pages=list(pages) if pages else [1]
            )
        )
        request = documentai.ProcessRequest(
            name=name,
            raw_document=raw_document,
            field_mask=field_mask,
            process_options=process_options,
        )
        result = client.process_document(request=request)
        return result.document

    def get_document_text(
        self, document: documentai.Document, page_number: int = 0
    ) -> str:
        """Return the text of one page of ``document``.

        Args:
            document: The processed document.
            page_number: Index into ``document.pages``. Note that
                ``document.pages`` is 0-indexed: if you request page 1, it
                will be in ``document.pages[0]``.

        Returns:
            The slice(s) of ``document.text`` referenced by the page layout's
            text anchor.

        Raises:
            IndexError: If ``page_number`` is out of range.
        """
        # A Page message carries no text of its own; its layout's text anchor
        # points into the document-level text.
        page = document.pages[page_number]
        return self._get_text(page.layout.text_anchor, document.text)

    @staticmethod
    def _get_style_info(
        text_anchor: documentai.Document.TextAnchor,
        document: documentai.Document,
    ) -> str:
        """Describe the text styles that overlap ``text_anchor``.

        Returns:
            ``"N/A"`` when the document has no ``text_styles``, ``"default"``
            when no style yields a description, otherwise the unique style
            descriptions in first-seen order, joined with ``", "``.
        """
        if not hasattr(document, "text_styles") or not document.text_styles:
            return "N/A"

        styles = []
        # A text anchor can have multiple non-contiguous segments.
        for para_segment in text_anchor.text_segments:
            para_start = int(para_segment.start_index)
            para_end = int(para_segment.end_index)

            for style in document.text_styles:
                for style_segment in style.text_anchor.text_segments:
                    style_start = int(style_segment.start_index)
                    style_end = int(style_segment.end_index)

                    # Skip styles whose range does not overlap the segment.
                    if max(para_start, style_start) >= min(para_end, style_end):
                        continue

                    parts = []
                    if style.font_size and style.font_size.size > 0:
                        unit = style.font_size.unit if style.font_size.unit else "pt"
                        parts.append(
                            f"font size: {round(style.font_size.size)}{unit}"
                        )
                    if style.font_weight and style.font_weight.lower() != "normal":
                        parts.append(f"font weight: {style.font_weight}")
                    if style.text_style and style.text_style.lower() != "normal":
                        parts.append(f"text style: {style.text_style}")
                    if style.font_family:
                        parts.append(f"font family: {style.font_family}")

                    if parts:
                        styles.append(" ".join(parts))

        if styles:
            # dict.fromkeys removes duplicates while preserving order.
            return ", ".join(dict.fromkeys(styles))
        return "default"

    @staticmethod
    def _get_text(text_anchor: documentai.Document.TextAnchor, text: str) -> str:
        """Concatenate the slices of ``text`` referenced by ``text_anchor``.

        Returns an empty string when the anchor has no segments.
        """
        if not text_anchor.text_segments:
            return ""
        return "".join(
            text[int(segment.start_index):int(segment.end_index)]
            for segment in text_anchor.text_segments
        )

    def extract_text_with_bounding_boxes(
        self, document: documentai.Document
    ) -> List[Dict[str, Any]]:
        """Extract text and bounding box for each paragraph in the document.

        Args:
            document: The processed documentai.Document object.

        Returns:
            A list of dictionaries, one per paragraph, each containing:
            - 'page_number': The page number (1-based).
            - 'text': The text of the paragraph, stripped of surrounding
              whitespace.
            - 'bounding_box': A list of normalized vertices (``{"x", "y"}``
              dicts) for the bounding box.
            - 'style': Style information for the text.
            - 'height': The height of the text block in millimeters (mm),
              rounded to two decimals; 0.0 when the paragraph has no
              normalized vertices.
        """
        all_paragraphs = []
        full_text = document.text

        for page in document.pages:
            # NOTE(review): the conversion below treats page.dimension.height
            # as points; confirm against page.dimension.unit for non-PDF input.
            page_pts = page.dimension.height

            for paragraph in page.paragraphs:
                layout = paragraph.layout
                normalized = layout.bounding_poly.normalized_vertices
                vertices = [{"x": vertex.x, "y": vertex.y} for vertex in normalized]

                # max()/min() raise on an empty sequence, so a paragraph
                # without vertices gets a zero height.
                y_coords = [vertex.y for vertex in normalized]
                height_ratio = max(y_coords) - min(y_coords) if y_coords else 0.0
                height_mm = height_ratio * page_pts * self._PT_TO_MM

                all_paragraphs.append({
                    "page_number": page.page_number,
                    "text": self._get_text(layout.text_anchor, full_text).strip(),
                    "bounding_box": vertices,
                    "style": self._get_style_info(layout.text_anchor, document),
                    "height": round(height_mm, 2),
                })

        return all_paragraphs

    def extract_text_with_markdown_table(self, document: documentai.Document) -> str:
        """Return the document's paragraphs rendered as a Markdown table."""
        data = self.extract_text_with_bounding_boxes(document)
        return self._create_markdown_table(data)

    def _quantize_coord(self, val, grid_size=1000) -> int:
        """Convert a float (0-1) to an integer on a grid, truncating."""
        return int(val * grid_size)

    def _create_markdown_table(self, data) -> str:
        """Render paragraph dictionaries as a Markdown table.

        Args:
            data: Iterable of dicts shaped like the items returned by
                ``extract_text_with_bounding_boxes``.

        Returns:
            A newline-terminated Markdown table with one row per item. Text
            IDs start at 1; X and Y are the first bounding-box vertex
            quantized onto a 0-1000 grid (0 when the box has no vertices).
        """
        lines = [
            "| Text ID | X | Y | Text Height (mm) | Style | Text |",
            "|----|-----|-----|--------|-------|"
            "-------------------------------------------------------------------------|",
        ]
        for text_id, item in enumerate(data, start=1):
            box = item["bounding_box"]
            # The first vertex is used as the block's position.
            first_vertex = box[0] if box else {"x": 0.0, "y": 0.0}
            x = self._quantize_coord(first_vertex["x"])
            y = self._quantize_coord(first_vertex["y"])
            height = round(item.get("height", 0), 2)
            style = item.get("style", "N/A")
            # A raw newline or an unescaped pipe inside a cell would break
            # the table row, so flatten the former and escape the latter.
            text = item["text"].replace("\n", " ").replace("|", "\\|").strip()
            lines.append(f"| {text_id} | {x} | {y} | {height} | {style} | {text} |")
        return "\n".join(lines) + "\n"

    def get_bounding_boxes(
        self, document: documentai.Document, page_number: int = 0
    ) -> list[documentai.BoundingPoly]:
        """Extract bounding boxes for tokens on a specific page.

        ``page_number`` is a 0-based index into ``document.pages``.
        """
        page = document.pages[page_number]
        return [token.layout.bounding_poly for token in page.tokens]

    def extract_text_heights_mm(self, document: documentai.Document) -> List[tuple]:
        """Extract the height of each line of text in millimeters (mm).

        Parameters:
            document (google.cloud.documentai.Document): Parsed Document AI
                response object.

        Returns:
            List of tuples: [(page_num, line_text, height_mm), ...], where
            page_num counts pages from 1 in iteration order and height_mm is
            rounded to two decimals. Lines without normalized vertices are
            skipped.
        """
        heights = []

        for page_num, page in enumerate(document.pages, start=1):
            # NOTE(review): treated as points (e.g. 792 for US Letter);
            # confirm against page.dimension.unit for non-PDF input.
            page_height_pt = page.dimension.height

            for line in page.lines:
                layout = line.layout
                y_coords = [v.y for v in layout.bounding_poly.normalized_vertices]
                if not y_coords:
                    continue

                height_ratio = max(y_coords) - min(y_coords)
                height_mm = height_ratio * page_height_pt * self._PT_TO_MM

                # Join every anchor segment; an anchor with no segments
                # yields "" rather than an IndexError.
                line_text = self._get_text(layout.text_anchor, document.text).strip()
                heights.append((page_num, line_text, round(height_mm, 2)))

        return heights