# File size: 9,930 Bytes | 863cb78
import os
from typing import Optional, List, Dict, Any
from google.api_core.client_options import ClientOptions
from google.cloud import documentai # type: ignore
from PIL import Image, ImageChops
from io import BytesIO
import fitz # PyMuPDF
import base64
class GoogleDocumentAPI:
    """Helper around a single Google Cloud Document AI processor.

    The instance stores the project, location and processor identifiers,
    sends one file at a time for processing, and offers helpers that turn
    the returned ``documentai.Document`` into plain text, per-paragraph
    layout records, or a Markdown table.

    Annotations that mention ``documentai`` are written as strings so they
    are resolved by type checkers only, not when the class is defined.
    """

    # Conversion factor from points to millimeters (shared by all height helpers).
    _PT_TO_MM = 0.3528

    def __init__(self, credentials_path: str):
        """Point the Google client libraries at *credentials_path*.

        Note that this sets ``GOOGLE_APPLICATION_CREDENTIALS`` for the whole
        process, not just for this instance.
        """
        os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = credentials_path
        self.project_id = "649829115993"
        self.location = "us"  # Format is "us" or "eu"
        self.processor_id = "7f9fd758484d83fe"  # Only use this
        # Refer to https://cloud.google.com/document-ai/docs/file-types for supported file types
        self.mime_type = "application/pdf"

    def process_document(
        self,
        file_path: str,
        field_mask: Optional[str] = None,
        processor_version_id: Optional[str] = None,
    ) -> "documentai.Document":
        """Send the file at *file_path* to Document AI and return the result.

        Only page 1 of the input is requested (see ``process_options``).

        Args:
            file_path: Path of the file to upload; it is sent with
                ``self.mime_type``.
            field_mask: Optional field mask forwarded to the request.
            processor_version_id: When given, that processor version is
                addressed instead of the processor itself.

        Returns:
            The ``document`` attribute of the service response.
        """
        opts = ClientOptions(api_endpoint=f"{self.location}-documentai.googleapis.com")
        client = documentai.DocumentProcessorServiceClient(client_options=opts)

        if processor_version_id:
            name = client.processor_version_path(
                self.project_id, self.location, self.processor_id, processor_version_id
            )
        else:
            name = client.processor_path(self.project_id, self.location, self.processor_id)

        with open(file_path, "rb") as source_file:
            file_content = source_file.read()

        raw_document = documentai.RawDocument(content=file_content, mime_type=self.mime_type)
        process_options = documentai.ProcessOptions(
            individual_page_selector=documentai.ProcessOptions.IndividualPageSelector(
                pages=[1]
            )
        )
        request = documentai.ProcessRequest(
            name=name,
            raw_document=raw_document,
            field_mask=field_mask,
            process_options=process_options,
        )
        result = client.process_document(request=request)
        return result.document

    def get_document_text(self, document: "documentai.Document", page_number: int = 0) -> str:
        """Return the text of one page of *document*.

        Args:
            document: The processed document.
            page_number: 0-based index into ``document.pages``. If you
                requested page 1, it is found at ``document.pages[0]``.

        Returns:
            The slice(s) of ``document.text`` referenced by the page layout.
        """
        # A page carries no text of its own; its layout's text anchor points
        # into the document-level text.
        page = document.pages[page_number]
        return self._get_text(page.layout.text_anchor, document.text)

    @staticmethod
    def _describe_style(style) -> str:
        """Render one text style as a space-separated description ('' if empty)."""
        parts = []
        font_size = style.font_size
        if font_size and font_size.size > 0:
            unit = font_size.unit or 'pt'
            parts.append(f"font size: {round(font_size.size)}{unit}")
        if style.font_weight and style.font_weight.lower() != 'normal':
            parts.append(f"font weight: {style.font_weight}")
        if style.text_style and style.text_style.lower() != 'normal':
            parts.append(f"text style: {style.text_style}")
        if style.font_family:
            parts.append(f'font family: {style.font_family}')
        return " ".join(parts)

    @staticmethod
    def _get_style_info(text_anchor: "documentai.Document.TextAnchor", document: "documentai.Document") -> str:
        """Describe the styles whose text ranges overlap *text_anchor*.

        Returns:
            ``"N/A"`` when the document has no ``text_styles``, ``"default"``
            when no style with a non-empty description overlaps the anchor,
            otherwise the unique descriptions joined by ``", "`` in the
            order they were first encountered.
        """
        text_styles = getattr(document, 'text_styles', None)
        if not text_styles:
            return "N/A"

        found = []
        # A text anchor can have multiple non-contiguous segments.
        for para_segment in text_anchor.text_segments:
            para_start = int(para_segment.start_index)
            para_end = int(para_segment.end_index)
            for style in text_styles:
                # Two half-open ranges overlap when the larger start lies
                # before the smaller end.
                overlaps = any(
                    max(para_start, int(style_segment.start_index))
                    < min(para_end, int(style_segment.end_index))
                    for style_segment in style.text_anchor.text_segments
                )
                if not overlaps:
                    continue
                description = GoogleDocumentAPI._describe_style(style)
                if description:
                    found.append(description)

        if found:
            # dict.fromkeys keeps first-seen order while dropping duplicates.
            return ", ".join(dict.fromkeys(found))
        return "default"

    @staticmethod
    def _get_text(text_anchor: "documentai.Document.TextAnchor", text: str) -> str:
        """Concatenate the slices of *text* referenced by *text_anchor*."""
        if not text_anchor.text_segments:
            return ""
        return "".join(
            text[int(segment.start_index):int(segment.end_index)]
            for segment in text_anchor.text_segments
        )

    @classmethod
    def _span_height_mm(cls, normalized_vertices, page_height) -> Optional[float]:
        """Return the unrounded vertical extent of a polygon in millimeters.

        The extent of the normalized y coordinates is scaled by
        *page_height* and then by the point-to-millimeter factor. Returns
        ``None`` when there are no vertices to measure.
        """
        y_coords = [vertex.y for vertex in normalized_vertices]
        if not y_coords:
            return None
        return (max(y_coords) - min(y_coords)) * page_height * cls._PT_TO_MM

    def extract_text_with_bounding_boxes(self, document: "documentai.Document") -> List[Dict[str, Any]]:
        """
        Extracts text and bounding box for each paragraph in the document.

        Args:
            document: The processed documentai.Document object.

        Returns:
            A list of dictionaries, where each dictionary contains:
            - 'page_number': The page's ``page_number`` attribute.
            - 'text': The stripped text of the paragraph.
            - 'bounding_box': A list of ``{"x", "y"}`` normalized vertices
              (empty when the paragraph has no bounding polygon).
            - 'style': Style information for the text.
            - 'height': The height of the text block in millimeters (mm),
              rounded to 2 decimals; 0.0 when there is no bounding polygon.
        """
        all_paragraphs = []
        full_text = document.text

        for page in document.pages:
            # NOTE(review): the conversion treats page.dimension.height as
            # points; Document AI also reports page.dimension.unit, which is
            # not checked here - TODO confirm the unit.
            page_height = page.dimension.height

            for paragraph in page.paragraphs:
                layout = paragraph.layout
                normalized_vertices = layout.bounding_poly.normalized_vertices
                height_mm = self._span_height_mm(normalized_vertices, page_height)

                all_paragraphs.append({
                    "page_number": page.page_number,
                    "text": self._get_text(layout.text_anchor, full_text).strip(),
                    "bounding_box": [
                        {"x": vertex.x, "y": vertex.y} for vertex in normalized_vertices
                    ],
                    "style": self._get_style_info(layout.text_anchor, document),
                    # Keep the paragraph even without a polygon instead of
                    # failing on max()/min() of an empty sequence.
                    "height": 0.0 if height_mm is None else round(height_mm, 2),
                })

        return all_paragraphs

    def extract_text_with_markdown_table(self, document: "documentai.Document") -> str:
        """Return the document's paragraphs rendered as a Markdown table."""
        data = self.extract_text_with_bounding_boxes(document)
        return self._create_markdown_table(data)

    def _quantize_coord(self, val: float, grid_size: int = 1000) -> int:
        """Converts a float (0-1) to an integer on a grid (truncating)."""
        return int(val * grid_size)

    def _create_markdown_table(self, data) -> str:
        """Render paragraph records as a Markdown table, one row per record.

        Args:
            data: Records shaped like the output of
                ``extract_text_with_bounding_boxes``.

        Returns:
            Header line, separator line and one line per record, each
            terminated by a newline. X and Y are the quantized coordinates
            of the first bounding-box vertex, or ``N/A`` when the record has
            no bounding box.
        """
        rows = [
            "| Text ID | X | Y | Text Height (mm) | Style | Text |",
            "|----|-----|-----|--------|-------|-------------------------------------------------------------------------|",
        ]
        for text_id, item in enumerate(data, start=1):
            bounding_box = item['bounding_box']
            if bounding_box:
                top_left = bounding_box[0]
                x = self._quantize_coord(top_left['x'])
                y = self._quantize_coord(top_left['y'])
            else:
                x = y = 'N/A'
            height = round(item.get('height', 0), 2)
            style = item.get('style', 'N/A')
            # A table cell must stay on one line, and a literal pipe would
            # end the cell early, so newlines become spaces and pipes are
            # backslash-escaped.
            text = item['text'].replace('\n', ' ').replace('|', '\\|').strip()
            rows.append(f"| {text_id} | {x} | {y} | {height} | {style} | {text} |")
        return "".join(f"{row}\n" for row in rows)

    def get_bounding_boxes(self, document: "documentai.Document", page_number: int = 0) -> List["documentai.BoundingPoly"]:
        """
        Extracts bounding boxes for tokens on a specific page.

        Args:
            document: The processed document.
            page_number: 0-based index into ``document.pages``.
        """
        page = document.pages[page_number]
        return [token.layout.bounding_poly for token in page.tokens]

    def extract_text_heights_mm(self, document: "documentai.Document") -> List[tuple]:
        """
        Extracts the height of each line of text from a Google Document AI parsed document
        and returns a list of heights in millimeters (mm).

        Lines without a bounding polygon are skipped.

        Parameters:
            document (google.cloud.documentai.Document): Parsed Document AI response object

        Returns:
            List of tuples: [(page_num, line_text, height_mm), ...] where
            page_num counts pages from 1 in iteration order.
        """
        heights = []

        for page_num, page in enumerate(document.pages, start=1):
            # NOTE(review): treated as points (e.g., 792 for US Letter);
            # page.dimension.unit is not checked - TODO confirm.
            page_height = page.dimension.height

            for line in page.lines:
                layout = line.layout
                height_mm = self._span_height_mm(
                    layout.bounding_poly.normalized_vertices, page_height
                )
                if height_mm is None:
                    continue
                # Join every segment of the anchor (not only the first) and
                # tolerate an anchor without segments.
                line_text = self._get_text(layout.text_anchor, document.text).strip()
                heights.append((page_num, line_text, round(height_mm, 2)))

        return heights
|