from __future__ import annotations

import glob
import os  # fixes typo'd `import osa`, merged with duplicate `import os`
import re
import textwrap

import pandas as pd
from google.api_core.client_options import ClientOptions
from google.cloud import documentai
| |
|
def create_processor(
    project_id: str, location: str, processor_display_name: str
) -> documentai.Processor:
    """Create a Document AI OCR processor.

    Args:
        project_id: Google Cloud project ID that will own the processor.
        location: Processor region (e.g. "us"); also selects the regional
            API endpoint.
        processor_display_name: Human-readable name for the new processor.

    Returns:
        The newly created ``documentai.Processor``.
    """
    # Build the regional endpoint from the `location` argument instead of
    # relying on a `client_options` global that is only defined under
    # __main__ (importing this module and calling the function raised
    # NameError).
    opts = ClientOptions(api_endpoint=f"{location}-documentai.googleapis.com")
    client = documentai.DocumentProcessorServiceClient(client_options=opts)

    parent = client.common_location_path(project_id, location)

    return client.create_processor(
        parent=parent,
        processor=documentai.Processor(
            display_name=processor_display_name, type_="OCR_PROCESSOR"
        ),
    )
| |
|
def process_document(
    processor_name: str,
    file_path: str,
    mime_type: str = "application/pdf",
) -> documentai.Document:
    """Send a local file through a Document AI processor.

    Args:
        processor_name: Full processor resource name, e.g.
            ``projects/{project}/locations/{location}/processors/{id}``.
        file_path: Path of the local file to process.
        mime_type: MIME type of the file. Defaults to PDF; the new keyword
            lets callers process other formats the processor supports.

    Returns:
        The parsed ``documentai.Document`` from the processing result.
    """
    # Derive the regional endpoint from the processor resource name rather
    # than depending on a `client_options` global that exists only when the
    # module is run as a script.
    match = re.search(r"/locations/([^/]+)/", processor_name)
    # NOTE(review): fall back to "us" for atypical names — confirm this
    # matches the deployment region.
    location = match.group(1) if match else "us"
    opts = ClientOptions(api_endpoint=f"{location}-documentai.googleapis.com")
    client = documentai.DocumentProcessorServiceClient(client_options=opts)

    # Read the raw bytes; the API receives the document content inline.
    with open(file_path, "rb") as fh:
        content = fh.read()

    raw_document = documentai.RawDocument(content=content, mime_type=mime_type)

    request = documentai.ProcessRequest(name=processor_name, raw_document=raw_document)

    result = client.process_document(request=request)

    return result.document
| |
|
def layout_to_text(layout: documentai.Document.Page.Layout, text: str) -> str:
    """Extract the substring of *text* referenced by a layout element.

    Document AI locates each element as (start, end) offsets into the full
    document text; this stitches those segments back into one string.
    """
    pieces = []
    for segment in layout.text_anchor.text_segments:
        start = int(segment.start_index)
        end = int(segment.end_index)
        pieces.append(text[start:end])
    return "".join(pieces)
| |
|
def pdf_processor(
    processor_name: str,
    extracted_data: list[dict],
    pdf_glob: str = "docs/*.pdf",
) -> list[dict]:
    """Process every PDF matching *pdf_glob* and collect per-paragraph chunks.

    Args:
        processor_name: Full Document AI processor resource name.
        extracted_data: List to append chunk records to (mutated in place).
        pdf_glob: Glob pattern of PDFs to process. New keyword generalizes
            the previously hard-coded ``"docs/*.pdf"``.

    Returns:
        *extracted_data*, with one dict per paragraph chunk appended:
        ``{"file_name", "file_type", "chunk_number", "content"}``.
    """
    for path in glob.glob(pdf_glob):
        # NOTE(review): file_name keeps the directory prefix (e.g.
        # "docs/report") — preserved as-is since downstream sorts on it.
        file_name, file_type = os.path.splitext(path)
        print(f"Processing {file_name}")

        document = process_document(processor_name, file_path=path)
        if not document:
            print("Processing did not complete successfully.")
            continue

        # One chunk per paragraph, resolved from text-anchor offsets.
        document_chunks = [
            layout_to_text(paragraph.layout, document.text)
            for page in document.pages
            for paragraph in page.paragraphs
        ]

        for chunk_number, chunk_content in enumerate(document_chunks, start=1):
            extracted_data.append(
                {
                    "file_name": file_name,
                    "file_type": file_type,
                    "chunk_number": chunk_number,
                    "content": chunk_content,
                }
            )
    return extracted_data
| |
|
| |
|
if __name__ == "__main__":
    project_id = "iglintdb"
    location = "us"
    processor_display_name = "knowledge-base-ocr-processor-test-1"
    # Regional endpoint; kept as a module global because helper functions
    # in this file read it when run as a script.
    client_options = ClientOptions(api_endpoint=f"{location}-documentai.googleapis.com")
    processor = create_processor(project_id, location, processor_display_name)

    processor_name = processor.name
    # Dropped unused `chunk_size = 5000` (never read anywhere in the file).
    extracted_data: list[dict] = []

    extracted_data = pdf_processor(processor_name, extracted_data)

    pdf_data = (
        pd.DataFrame.from_dict(extracted_data)
        .sort_values(by=["file_name"])
        .reset_index(drop=True)
    )

    # Removed no-op `pdf_data.head()` — its return value was discarded.
    # Ensure the output directory exists: DataFrame.to_csv does not create
    # parent directories and would raise OSError otherwise.
    os.makedirs("doc_ai", exist_ok=True)
    pdf_data.to_csv("doc_ai/pdf_data.csv", index=False)
| |
|
| |
|
| |
|
| |
|
| |
|