| |
|
| | """
|
| | Test script for Google Document AI functionality.
|
| | This script demonstrates the text extraction with bounding boxes and height calculation.
|
| | """
|
| |
|
| | import os
|
| | import sys
|
| | from pathlib import Path
|
| |
|
| |
|
# Put the "src" directory that sits next to this script on the import path so
# the extract_text import that follows can be resolved.
sys.path.append(str(Path(__file__).parent.joinpath("src")))
|
| |
|
| | from extract_text.google_document_api import GoogleDocumentAPI
|
| |
|
# NOTE(review): the status glyphs in this script arrived mis-decoded ("β", "π").
# "✅", "❌" and "🎨" below were restored from the surviving bytes and the message
# context; the bare "π" prefixes are left exactly as found because the original
# emoji cannot be recovered from this file -- confirm against the repository.


def _save_results(text_blocks, markdown_table, basic_text, heights_mm):
    """Write the four result files into the current working directory.

    Args:
        text_blocks: Sequence of mappings with the keys 'page_number',
            'height', 'style', 'text' and 'bounding_box'.
        markdown_table: String written verbatim below the markdown headings.
        basic_text: String written verbatim below the plain-text heading.
        heights_mm: Sequence of (page_num, line_text, height_mm) tuples.
    """
    with open("test_results_text_blocks.txt", "w", encoding="utf-8") as f:
        f.write("Text Blocks with Height Information:\n")
        f.write("=" * 50 + "\n\n")
        for i, block in enumerate(text_blocks):
            f.write(f"Block {i+1}:\n")
            f.write(f" Page: {block['page_number']}\n")
            f.write(f" Height: {block['height']:.2f} mm\n")
            f.write(f" Style: {block['style']}\n")
            f.write(f" Text: {block['text']}\n")
            f.write(f" Bounding Box: {block['bounding_box']}\n")
            f.write("-" * 40 + "\n")

    with open("test_results_markdown_table.md", "w", encoding="utf-8") as f:
        f.write("# Google Document AI Results\n\n")
        f.write("## Text Blocks with Height Information\n\n")
        f.write(markdown_table)

    with open("test_results_basic_text.txt", "w", encoding="utf-8") as f:
        f.write("Basic Extracted Text:\n")
        f.write("=" * 30 + "\n\n")
        f.write(basic_text)

    with open("test_results_heights_mm.txt", "w", encoding="utf-8") as f:
        f.write("Line Heights in Millimeters:\n")
        f.write("=" * 40 + "\n\n")
        for i, (page_num, line_text, height_mm) in enumerate(heights_mm):
            f.write(f"Line {i+1}: Page {page_num}, Height={height_mm}mm\n")
            f.write(f"Text: {line_text}\n")
            f.write("-" * 40 + "\n")

    # Announce the files only once every one of them has been written, so the
    # list never names a file that a failed write did not produce.
    print("✅ Results saved to:")
    print(" - test_results_text_blocks.txt")
    print(" - test_results_markdown_table.md")
    print(" - test_results_basic_text.txt")
    print(" - test_results_heights_mm.txt")


def _print_statistics(text_blocks):
    """Print min/max/average block height and the number of blocks per style."""
    from collections import Counter

    print("\n9. Statistics:")
    print("-" * 30)
    heights = [block['height'] for block in text_blocks]
    if heights:  # min()/max() and the average are undefined for an empty list
        print("π Height statistics:")
        print(f" Min height: {min(heights):.2f} mm")
        print(f" Max height: {max(heights):.2f} mm")
        print(f" Average height: {sum(heights)/len(heights):.2f} mm")

    style_counts = Counter(block['style'] for block in text_blocks)
    print("\n🎨 Style distribution:")
    # most_common() lists the highest count first; ties keep first-seen order.
    for style, count in style_counts.most_common():
        print(f" {style}: {count} blocks")


def test_google_doc_ai():
    """Test the Google Document AI functionality with a sample PDF.

    Checks that the credentials file and the sample PDF exist, then runs each
    GoogleDocumentAPI call in turn (process_document, get_document_text,
    extract_text_with_bounding_boxes, extract_text_with_markdown_table,
    extract_text_heights_mm), printing progress, saving the results to four
    files in the current working directory and printing summary statistics.

    Returns:
        None. A missing input file is reported on stdout and the function
        returns early. Any exception raised by the steps is caught, reported
        with its traceback, and not re-raised.
    """
    # Both paths are relative to the current working directory.
    credentials_path = "src/extract_text/photon-services-f0d3ec1417d0.json"
    # NOTE(review): the directory is spelled "Kir-Kat" while the file is
    # "kitkat" -- confirm this matches the layout on disk.
    test_pdf_path = "requirements_library/client-requirements/Kir-Kat/kitkat-f1.pdf"

    if not os.path.exists(credentials_path):
        print(f"❌ Credentials file not found: {credentials_path}")
        print("Please ensure the Google Cloud credentials file is in the correct location.")
        return

    if not os.path.exists(test_pdf_path):
        print(f"❌ Test PDF file not found: {test_pdf_path}")
        print("Please ensure the test PDF file exists.")
        return

    print("π Testing Google Document AI functionality...")
    print(f"π Using PDF: {test_pdf_path}")
    print(f"π Using credentials: {credentials_path}")
    print("-" * 80)

    try:
        print("1. Initializing Google Document API...")
        doc_api = GoogleDocumentAPI(credentials_path)
        print("✅ Google Document API initialized successfully")

        print("\n2. Processing document...")
        document = doc_api.process_document(test_pdf_path)
        print("✅ Document processed successfully")

        print("\n3. Extracting basic text...")
        basic_text = doc_api.get_document_text(document, page_number=0)
        print(f"π Basic text length: {len(basic_text)} characters")
        print(f"π First 200 characters: {basic_text[:200]}...")

        print("\n4. Extracting text with bounding boxes and height...")
        text_blocks = doc_api.extract_text_with_bounding_boxes(document)
        print(f"π Found {len(text_blocks)} text blocks")

        print("\n5. Sample text blocks with height information:")
        print("-" * 80)
        for i, block in enumerate(text_blocks[:10]):
            print(f"Block {i+1}:")
            print(f" Page: {block['page_number']}")
            print(f" Height: {block['height']:.2f} mm")
            print(f" Style: {block['style']}")
            print(f" Text: {block['text'][:100]}{'...' if len(block['text']) > 100 else ''}")
            print(f" Bounding Box: {block['bounding_box']}")
            print()

        print("\n6. Generating markdown table...")
        markdown_table = doc_api.extract_text_with_markdown_table(document)
        print("π Markdown table generated successfully")

        print("\n7. Testing extract_text_heights_mm function...")
        heights_mm = doc_api.extract_text_heights_mm(document)
        print(f"π Found {len(heights_mm)} lines with height in mm")

        print("\nπ Sample line heights (mm):")
        print("-" * 60)
        for i, (page_num, line_text, height_mm) in enumerate(heights_mm[:10]):
            print(f"Line {i+1}: Page {page_num}, Height={height_mm}mm | Text: {line_text[:50]}...")

        print("\n8. Saving results to files...")
        _save_results(text_blocks, markdown_table, basic_text, heights_mm)

        _print_statistics(text_blocks)

        print("\nπ Test completed successfully!")

    except Exception as e:
        # Outermost boundary of the script: report the failure with its
        # traceback and let the caller carry on.
        print(f"❌ Error during testing: {str(e)}")
        import traceback
        traceback.print_exc()
|
| |
|
def display_markdown_preview(markdown_path="test_results_markdown_table.md"):
    """Display a preview of the generated markdown table.

    Args:
        markdown_path: File to read and print. Defaults to
            "test_results_markdown_table.md" in the current working
            directory, so calls without an argument behave as before.

    Returns:
        None. The file content is printed under a heading and an 80-character
        rule; a missing file is reported on stdout instead of raising.
    """
    try:
        with open(markdown_path, "r", encoding="utf-8") as f:
            content = f.read()
    except FileNotFoundError:
        # NOTE(review): "❌" restored from a mis-decoded "β" in this file -- confirm.
        print("❌ Markdown table file not found. Run the test first.")
        return

    # NOTE(review): the leading "π" looks like a mis-decoded emoji whose
    # original cannot be recovered here, so it is kept as found.
    print("\nπ Markdown Table Preview:")
    print("=" * 80)
    print(content)
|
| |
|
if __name__ == "__main__":
    # Script entry point: print the banner and its rule, run the extraction
    # check, then echo the markdown file it saved.
    # NOTE(review): the leading "π" looks like a mis-decoded emoji; it is kept
    # as found because the original glyph cannot be recovered here -- confirm.
    print("π Google Document AI Test Script", "=" * 50, sep="\n")

    test_google_doc_ai()

    display_markdown_preview()