File size: 9,930 Bytes
863cb78
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
import os
from typing import Optional, List, Dict, Any
from google.api_core.client_options import ClientOptions
from google.cloud import documentai  # type: ignore
from PIL import Image, ImageChops
from io import BytesIO
import fitz  # PyMuPDF
import base64

class GoogleDocumentAPI:
    def __init__(self, credentials_path: str):
        os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = credentials_path
        
        self.project_id = "649829115993"
        self.location = "us"  # Format is "us" or "eu"
        self.processor_id = "7f9fd758484d83fe"  # Only use this
        self.mime_type = "application/pdf"  # Refer to https://cloud.google.com/document-ai/docs/file-types for supported file types

    def process_document(
        self,
        file_path: str,
        field_mask: Optional[str] = None,
        processor_version_id: Optional[str] = None,
        pages: Optional[List[int]] = None,
    ) -> documentai.Document:
        """Send a local file to the configured Document AI processor.

        Args:
            file_path: Path of the file to upload; it is read fully into memory
                and sent with ``self.mime_type``.
            field_mask: Optional field mask forwarded unchanged in the request.
            processor_version_id: When given, the request targets that specific
                processor version; otherwise the processor itself is addressed.
            pages: Page numbers handed to the individual page selector. ``None``
                or an empty list keeps the historical behaviour of selecting
                only page 1.

        Returns:
            The ``document`` field of the processor's response.
        """
        opts = ClientOptions(api_endpoint=f"{self.location}-documentai.googleapis.com")
        client = documentai.DocumentProcessorServiceClient(client_options=opts)

        if processor_version_id:
            name = client.processor_version_path(
                self.project_id, self.location, self.processor_id, processor_version_id
            )
        else:
            name = client.processor_path(self.project_id, self.location, self.processor_id)

        with open(file_path, "rb") as source_file:
            content = source_file.read()

        raw_document = documentai.RawDocument(content=content, mime_type=self.mime_type)

        # Copy the caller's list so the request never aliases a mutable argument.
        selected_pages = list(pages) if pages else [1]
        process_options = documentai.ProcessOptions(
            individual_page_selector=documentai.ProcessOptions.IndividualPageSelector(
                pages=selected_pages
            )
        )

        request = documentai.ProcessRequest(
            name=name,
            raw_document=raw_document,
            field_mask=field_mask,
            process_options=process_options,
        )

        result = client.process_document(request=request)
        return result.document

    def get_document_text(self, document: documentai.Document, page_number: int = 0) -> str:
        # Note: document.pages is 0-indexed. If you request page 1, it will be in document.pages[0]
        return document.pages[page_number].text

    @staticmethod
    def _get_style_info(text_anchor: documentai.Document.TextAnchor, document: documentai.Document) -> str:
        """Helper function to extract style information for a text anchor."""
        if not hasattr(document, 'text_styles') or not document.text_styles:
            return "N/A"

        styles = []
        # A text anchor can have multiple non-contiguous segments.
        for para_segment in text_anchor.text_segments:
            para_start = int(para_segment.start_index)
            para_end = int(para_segment.end_index)

            for style in document.text_styles:
                for style_segment in style.text_anchor.text_segments:
                    style_start = int(style_segment.start_index)
                    style_end = int(style_segment.end_index)

                    # Check for overlap between the paragraph segment and the style segment
                    if max(para_start, style_start) < min(para_end, style_end):
                        style_str_parts = []
                        if style.font_size and style.font_size.size > 0:
                            unit = style.font_size.unit if style.font_size.unit else 'pt'
                            style_str_parts.append(f"font size: {round(style.font_size.size)}{unit}")
                        if style.font_weight and style.font_weight.lower() != 'normal':
                            style_str_parts.append(f"font weight: {style.font_weight}")
                        if style.text_style and style.text_style.lower() != 'normal':
                            style_str_parts.append(f"text style: {style.text_style}")
                        if style.font_family:
                            style_str_parts.append(f'font family: {style.font_family}')
                        
                        if style_str_parts:
                            styles.append(" ".join(style_str_parts))
        
        if styles:
            # Using dict.fromkeys to preserve order and get unique styles
            unique_styles = list(dict.fromkeys(styles))
            return ", ".join(unique_styles)
        
        return "default"

    @staticmethod
    def _get_text(text_anchor: documentai.Document.TextAnchor, text: str) -> str:
        """Helper function to extract text from text_anchor."""
        if not text_anchor.text_segments:
            return ""
        return "".join(
            text[int(segment.start_index) : int(segment.end_index)]
            for segment in text_anchor.text_segments
        )

    def extract_text_with_bounding_boxes(self, document: documentai.Document) -> List[Dict[str, Any]]:
        """
        Extracts text and bounding box for each paragraph in the document.
        
        Args:
            document: The processed documentai.Document object.
            
        Returns:
            A list of dictionaries, where each dictionary contains:
            - 'page_number': The page number (1-based).
            - 'text': The text of the paragraph.
            - 'bounding_box': A list of normalized vertices for the bounding box.
            - 'style': Style information for the text.
            - 'height': The height of the text block in millimeters (mm).
        """
        all_paragraphs = []
        full_text = document.text
        pt_to_mm = 0.3528  # Conversion factor from points to millimeters

        for page in document.pages:
            # Get page height in points for height calculation
            page_pts = page.dimension.height
            
            for paragraph in page.paragraphs:
                p_text = self._get_text(paragraph.layout.text_anchor, full_text)
                style_info = self._get_style_info(paragraph.layout.text_anchor, document)
                
                # Get the normalized vertices for the bounding box
                vertices = [
                    {"x": vertex.x, "y": vertex.y}
                    for vertex in paragraph.layout.bounding_poly.normalized_vertices
                ]
                
                # Calculate height in millimeters
                y_coords = [vertex.y for vertex in paragraph.layout.bounding_poly.normalized_vertices]
                height_ratio = max(y_coords) - min(y_coords)
                height_pt = height_ratio * page_pts
                height_mm = height_pt * pt_to_mm

                all_paragraphs.append({
                    "page_number": page.page_number,
                    "text": p_text.strip(), # Use .strip() to remove leading/trailing whitespace
                    "bounding_box": vertices,
                    "style": style_info,
                    "height": round(height_mm, 2)
                })
        return all_paragraphs
        



    def extract_text_with_markdown_table(self, document: documentai.Document) -> str:
        data = self.extract_text_with_bounding_boxes(document)
        return self._create_markdown_table(data)
    
    def _quantize_coord(self, val, grid_size=1000) -> int:
        """Converts a float (0-1) to an integer on a grid."""
        return int(val * grid_size)

    def _create_markdown_table(self, data) -> str:
        table = "| Text ID | X   | Y   | Text Height (mm) | Style | Text                                                           |\\n"
        table += "|----|-----|-----|--------|-------|-------------------------------------------------------------------------|\\n"
        for i, item in enumerate(data):
            top_left = item['bounding_box'][0]
            x = self._quantize_coord(top_left['x'])
            y = self._quantize_coord(top_left['y'])
            height = round(item.get('height', 0), 2)
            style = item.get('style', 'N/A')
            text = item['text'].replace('\\n', ' ').replace('|', '\\\\|').strip()
            table += f"| {i+1} | {x} | {y} | {height} | {style} | {text} |\\n"
        return table

    def get_bounding_boxes(self, document: documentai.Document, page_number: int = 0) -> list[documentai.BoundingPoly]:
        """
        Extracts bounding boxes for tokens on a specific page.
        """
        page = document.pages[page_number]
        return [token.layout.bounding_poly for token in page.tokens]

    def extract_text_heights_mm(self, document: documentai.Document) -> List[tuple]:
        """
        Extracts the height of each line of text from a Google Document AI parsed document
        and returns a list of heights in millimeters (mm).
        
        Parameters:
            document (google.cloud.documentai.Document): Parsed Document AI response object
        
        Returns:
            List of tuples: [(page_num, line_text, height_mm), ...]
        """
        heights = []
        pt_to_mm = 0.3528

        for page_num, page in enumerate(document.pages, start=1):
            page_height_pt = page.dimension.height  # e.g., 792 for US Letter

            for line in page.lines:
                layout = line.layout
                vertices = layout.bounding_poly.normalized_vertices

                y_coords = [v.y for v in vertices]
                if not y_coords:
                    continue

                height_ratio = max(y_coords) - min(y_coords)
                height_pt = height_ratio * page_height_pt
                height_mm = height_pt * pt_to_mm

                # Extract visible text (optional — may require mapping segments)
                text_segment = layout.text_anchor.text_segments[0]
                start = int(text_segment.start_index)
                end = int(text_segment.end_index)
                line_text = document.text[start:end].strip()

                heights.append((page_num, line_text, round(height_mm, 2)))

        return heights