# Spaces: Propelis / QC_Rules - Sleeping - File size: 9,930 Bytes - 863cb78

import os
from typing import Optional, List, Dict, Any
from google.api_core.client_options import ClientOptions
from google.cloud import documentai  # type: ignore
from PIL import Image, ImageChops
from io import BytesIO
import fitz  # PyMuPDF
import base64

class GoogleDocumentAPI:
    """Wrapper around a Google Cloud Document AI processor.

    Sends a file to the configured processor and offers helpers that turn the
    returned ``documentai.Document`` into plain text, per-paragraph records
    (text, bounding box, style, height) and a Markdown table of those records.
    """

    # Defaults for the processor this project targets.
    DEFAULT_PROJECT_ID = "649829115993"
    DEFAULT_LOCATION = "us"  # Format is "us" or "eu"
    DEFAULT_PROCESSOR_ID = "7f9fd758484d83fe"  # Only use this
    # Refer to https://cloud.google.com/document-ai/docs/file-types for supported file types
    DEFAULT_MIME_TYPE = "application/pdf"

    # Conversion factor from points to millimeters
    PT_TO_MM = 0.3528

    # Header and separator rows of the Markdown table (without line breaks).
    _TABLE_HEADER = "| Text ID | X   | Y   | Text Height (mm) | Style | Text                                                           |"
    _TABLE_RULE = "|----|-----|-----|--------|-------|-------------------------------------------------------------------------|"

    def __init__(
        self,
        credentials_path: str,
        *,
        project_id: str = DEFAULT_PROJECT_ID,
        location: str = DEFAULT_LOCATION,
        processor_id: str = DEFAULT_PROCESSOR_ID,
        mime_type: str = DEFAULT_MIME_TYPE,
    ):
        """Store the processor settings and point the Google SDK at the credentials.

        Args:
            credentials_path: Path to the service-account credentials file.
            project_id: Google Cloud project that owns the processor.
            location: Processor region, "us" or "eu".
            processor_id: Identifier of the Document AI processor to call.
            mime_type: MIME type sent with every document.
        """
        # NOTE: this mutates the process-wide environment, so it affects every
        # Google client created afterwards in this process.
        os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = credentials_path

        self.project_id = project_id
        self.location = location
        self.processor_id = processor_id
        self.mime_type = mime_type

    def process_document(
        self,
        file_path: str,
        field_mask: Optional[str] = None,
        processor_version_id: Optional[str] = None,
        pages: Optional[List[int]] = None,
    ) -> documentai.Document:
        """Send a local file to the processor and return the parsed document.

        Args:
            file_path: Path of the file to upload.
            field_mask: Optional field mask forwarded to the request.
            processor_version_id: When given, that processor version is
                addressed instead of the processor's default version.
            pages: Page numbers to process, as passed to the individual page
                selector (requesting page 1 yields ``document.pages[0]``).
                ``None`` or an empty list selects page 1 only, which is the
                historical behaviour of this method.

        Returns:
            The ``document`` attribute of the processing response.

        Any error raised while reading the file or by the Document AI client
        is propagated unchanged.
        """
        opts = ClientOptions(api_endpoint=f"{self.location}-documentai.googleapis.com")
        client = documentai.DocumentProcessorServiceClient(client_options=opts)

        if processor_version_id:
            name = client.processor_version_path(
                self.project_id, self.location, self.processor_id, processor_version_id
            )
        else:
            name = client.processor_path(self.project_id, self.location, self.processor_id)

        with open(file_path, "rb") as source_file:
            content = source_file.read()

        raw_document = documentai.RawDocument(content=content, mime_type=self.mime_type)

        selected_pages = list(pages) if pages else [1]
        process_options = documentai.ProcessOptions(
            individual_page_selector=documentai.ProcessOptions.IndividualPageSelector(
                pages=selected_pages
            )
        )

        request = documentai.ProcessRequest(
            name=name,
            raw_document=raw_document,
            field_mask=field_mask,
            process_options=process_options,
        )

        result = client.process_document(request=request)
        return result.document

    def get_document_text(self, document: documentai.Document, page_number: int = 0) -> str:
        """Return the text of one page of the document.

        Args:
            document: The processed documentai.Document object.
            page_number: 0-based index into ``document.pages``. If you
                requested page 1, it will be in ``document.pages[0]``.

        Returns:
            The slice(s) of ``document.text`` referenced by the page layout.

        Raises:
            IndexError: If ``page_number`` is outside ``document.pages``.
        """
        page = document.pages[page_number]
        # Pages carry no text of their own; the page layout's text anchor
        # indexes into the document-level text.
        return self._get_text(page.layout.text_anchor, document.text)

    @staticmethod
    def _describe_style(style) -> str:
        """Render one text style as a space-separated description ("" if nothing to report)."""
        parts = []
        if style.font_size and style.font_size.size > 0:
            unit = style.font_size.unit if style.font_size.unit else 'pt'
            parts.append(f"font size: {round(style.font_size.size)}{unit}")
        if style.font_weight and style.font_weight.lower() != 'normal':
            parts.append(f"font weight: {style.font_weight}")
        if style.text_style and style.text_style.lower() != 'normal':
            parts.append(f"text style: {style.text_style}")
        if style.font_family:
            parts.append(f'font family: {style.font_family}')
        return " ".join(parts)

    @staticmethod
    def _get_style_info(text_anchor: documentai.Document.TextAnchor, document: documentai.Document) -> str:
        """Helper function to extract style information for a text anchor.

        Returns:
            "N/A" when the document has no ``text_styles``; otherwise the
            unique descriptions of every style whose text range overlaps the
            anchor, joined with ", " in first-seen order; "default" when no
            overlapping style yields a description.
        """
        if not hasattr(document, 'text_styles') or not document.text_styles:
            return "N/A"

        # Describe each style once, keeping its text ranges for the overlap test.
        described = []
        for style in document.text_styles:
            description = GoogleDocumentAPI._describe_style(style)
            if description:
                spans = [
                    (int(segment.start_index), int(segment.end_index))
                    for segment in style.text_anchor.text_segments
                ]
                described.append((description, spans))

        styles = []
        # A text anchor can have multiple non-contiguous segments.
        for segment in text_anchor.text_segments:
            start = int(segment.start_index)
            end = int(segment.end_index)
            for description, spans in described:
                # Two half-open ranges overlap when the larger start lies
                # before the smaller end.
                if any(max(start, s) < min(end, e) for s, e in spans):
                    styles.append(description)

        if styles:
            # dict.fromkeys preserves order while dropping duplicates.
            return ", ".join(dict.fromkeys(styles))

        return "default"

    @staticmethod
    def _get_text(text_anchor: documentai.Document.TextAnchor, text: str) -> str:
        """Helper function to extract text from text_anchor.

        Concatenates the slices of ``text`` addressed by every segment of the
        anchor; returns "" when the anchor has no segments.
        """
        if not text_anchor.text_segments:
            return ""
        return "".join(
            text[int(segment.start_index) : int(segment.end_index)]
            for segment in text_anchor.text_segments
        )

    @staticmethod
    def _height_mm(normalized_vertices, page_height) -> Optional[float]:
        """Return the vertical extent of a box in millimeters, rounded to 2 decimals.

        Returns ``None`` when there are no vertices to measure.
        """
        y_coords = [vertex.y for vertex in normalized_vertices]
        if not y_coords:
            return None
        height_ratio = max(y_coords) - min(y_coords)
        # NOTE(review): treats page_height as points, as the original code
        # did; confirm against the unit reported in page.dimension.
        return round(height_ratio * page_height * GoogleDocumentAPI.PT_TO_MM, 2)

    def extract_text_with_bounding_boxes(self, document: documentai.Document) -> List[Dict[str, Any]]:
        """
        Extracts text and bounding box for each paragraph in the document.

        Args:
            document: The processed documentai.Document object.

        Returns:
            A list of dictionaries, where each dictionary contains:
            - 'page_number': The page's ``page_number`` attribute.
            - 'text': The text of the paragraph, stripped of surrounding whitespace.
            - 'bounding_box': A list of normalized vertices for the bounding box
              (empty when the paragraph has none).
            - 'style': Style information for the text.
            - 'height': The height of the text block in millimeters (mm);
              0.0 when the paragraph has no vertices.
        """
        all_paragraphs = []
        full_text = document.text

        for page in document.pages:
            page_height = page.dimension.height

            for paragraph in page.paragraphs:
                layout = paragraph.layout
                normalized = layout.bounding_poly.normalized_vertices
                height_mm = self._height_mm(normalized, page_height)

                all_paragraphs.append({
                    "page_number": page.page_number,
                    "text": self._get_text(layout.text_anchor, full_text).strip(),
                    "bounding_box": [{"x": vertex.x, "y": vertex.y} for vertex in normalized],
                    "style": self._get_style_info(layout.text_anchor, document),
                    # A paragraph without vertices has no measurable height.
                    "height": 0.0 if height_mm is None else height_mm,
                })
        return all_paragraphs

    def extract_text_with_markdown_table(self, document: documentai.Document) -> str:
        """Return the document's paragraphs rendered as a Markdown table."""
        data = self.extract_text_with_bounding_boxes(document)
        return self._create_markdown_table(data)

    def _quantize_coord(self, val, grid_size=1000) -> int:
        """Converts a float (0-1) to an integer on a grid (truncating)."""
        return int(val * grid_size)

    def _create_markdown_table(self, data) -> str:
        """Render paragraph records as a Markdown table, one record per row.

        Args:
            data: Records shaped like the output of
                ``extract_text_with_bounding_boxes``.

        Returns:
            The table text; every line, including the last, ends with a
            newline. X and Y are the first bounding-box vertex quantized to a
            0-1000 grid, or "N/A" when the record has no vertices.
        """
        rows = [self._TABLE_HEADER, self._TABLE_RULE]
        for row_id, item in enumerate(data, start=1):
            box = item['bounding_box']
            if box:
                x = self._quantize_coord(box[0]['x'])
                y = self._quantize_coord(box[0]['y'])
            else:
                x = y = "N/A"
            height = round(item.get('height', 0), 2)
            style = item.get('style', 'N/A')
            # Keep each record on one table row and stop pipes in the text
            # from being read as column separators.
            text = item['text'].replace('\n', ' ').replace('|', '\\|').strip()
            rows.append(f"| {row_id} | {x} | {y} | {height} | {style} | {text} |")
        return "\n".join(rows) + "\n"

    def get_bounding_boxes(self, document: documentai.Document, page_number: int = 0) -> List[documentai.BoundingPoly]:
        """
        Extracts bounding boxes for tokens on a specific page.

        Args:
            document: The processed documentai.Document object.
            page_number: 0-based index into ``document.pages``.
        """
        page = document.pages[page_number]
        return [token.layout.bounding_poly for token in page.tokens]

    def extract_text_heights_mm(self, document: documentai.Document) -> List[tuple]:
        """
        Extracts the height of each line of text from a Google Document AI parsed document
        and returns a list of heights in millimeters (mm).

        Lines without normalized vertices are skipped. The line text is built
        from all segments of the line's text anchor and is "" when the anchor
        has no segments.

        Parameters:
            document (google.cloud.documentai.Document): Parsed Document AI response object

        Returns:
            List of tuples: [(page_num, line_text, height_mm), ...] where
            page_num counts pages from 1 in document order.
        """
        heights = []

        for page_num, page in enumerate(document.pages, start=1):
            page_height = page.dimension.height

            for line in page.lines:
                layout = line.layout
                height_mm = self._height_mm(layout.bounding_poly.normalized_vertices, page_height)
                if height_mm is None:
                    continue

                line_text = self._get_text(layout.text_anchor, document.text).strip()
                heights.append((page_num, line_text, height_mm))

        return heights