#!/usr/bin/env python3
"""
Test script for Google Document AI functionality.
This script demonstrates the text extraction with bounding boxes and height calculation.
"""
import os
import sys
from pathlib import Path
# Add the src directory to the path so we can import our modules
sys.path.append(str(Path(__file__).parent / "src"))
from extract_text.google_document_api import GoogleDocumentAPI
def _save_results(text_blocks, markdown_table, basic_text, heights_mm):
    """Write the extraction results to four report files in the working directory.

    Args:
        text_blocks: Iterable of dicts read for the keys ``'page_number'``,
            ``'height'``, ``'style'``, ``'text'`` and ``'bounding_box'``.
        markdown_table: String written verbatim under a markdown heading.
        basic_text: String written verbatim as the plain-text dump.
        heights_mm: Iterable of ``(page_num, line_text, height_mm)`` tuples.
    """
    # Save raw text blocks
    with open("test_results_text_blocks.txt", "w", encoding="utf-8") as f:
        f.write("Text Blocks with Height Information:\n")
        f.write("=" * 50 + "\n\n")
        for number, block in enumerate(text_blocks, start=1):
            f.write(f"Block {number}:\n")
            f.write(f" Page: {block['page_number']}\n")
            f.write(f" Height: {block['height']:.2f} mm\n")
            f.write(f" Style: {block['style']}\n")
            f.write(f" Text: {block['text']}\n")
            f.write(f" Bounding Box: {block['bounding_box']}\n")
            f.write("-" * 40 + "\n")

    # Save markdown table
    with open("test_results_markdown_table.md", "w", encoding="utf-8") as f:
        f.write("# Google Document AI Results\n\n")
        f.write("## Text Blocks with Height Information\n\n")
        f.write(markdown_table)

    # Save basic text
    with open("test_results_basic_text.txt", "w", encoding="utf-8") as f:
        f.write("Basic Extracted Text:\n")
        f.write("=" * 30 + "\n\n")
        f.write(basic_text)

    print("✅ Results saved to:")
    print(" - test_results_text_blocks.txt")
    print(" - test_results_markdown_table.md")
    print(" - test_results_basic_text.txt")

    # Save heights data
    with open("test_results_heights_mm.txt", "w", encoding="utf-8") as f:
        f.write("Line Heights in Millimeters:\n")
        f.write("=" * 40 + "\n\n")
        for number, (page_num, line_text, height_mm) in enumerate(heights_mm, start=1):
            f.write(f"Line {number}: Page {page_num}, Height={height_mm}mm\n")
            f.write(f"Text: {line_text}\n")
            f.write("-" * 40 + "\n")
    print(" - test_results_heights_mm.txt")


def _print_statistics(text_blocks):
    """Print min/max/mean block height and the number of blocks per style.

    Args:
        text_blocks: Iterable of dicts read for the keys ``'height'`` and
            ``'style'``.
    """
    # Function-scope import so this block does not depend on a file-level
    # import being added elsewhere.
    from collections import Counter

    heights = [block['height'] for block in text_blocks]
    if heights:  # min(), max() and the mean are undefined for zero blocks
        print("📊 Height statistics:")
        print(f" Min height: {min(heights):.2f} mm")
        print(f" Max height: {max(heights):.2f} mm")
        print(f" Average height: {sum(heights)/len(heights):.2f} mm")

    # most_common() orders by descending count, keeping first-seen order
    # among styles with equal counts.
    style_counts = Counter(block['style'] for block in text_blocks)
    print("\n🎨 Style distribution:")
    for style, count in style_counts.most_common():
        print(f" {style}: {count} blocks")


def test_google_doc_ai(
    credentials_path="src/extract_text/photon-services-f0d3ec1417d0.json",
    test_pdf_path="requirements_library/client-requirements/Kir-Kat/kitkat-f1.pdf",
):
    """Test the Google Document AI functionality with a sample PDF.

    Runs the extraction calls on ``GoogleDocumentAPI`` in sequence, prints a
    progress report, writes four ``test_results_*`` files to the working
    directory and prints summary statistics. If either input file is missing
    it prints a message and returns without touching the API.

    Args:
        credentials_path: Path to the Google Cloud credentials JSON file.
            Defaults to the location this script originally hard-coded.
        test_pdf_path: Path to the PDF to process. Defaults to the sample
            document this script originally hard-coded.

    Returns:
        None. Any exception raised while talking to the API or writing the
        result files is reported on stdout with a traceback instead of
        propagating.
    """
    # NOTE: both parameters carry defaults, so a zero-argument call behaves
    # exactly as before (pytest only resolves fixtures for arguments without
    # a default).

    # Check if files exist
    if not os.path.exists(credentials_path):
        print(f"❌ Credentials file not found: {credentials_path}")
        print("Please ensure the Google Cloud credentials file is in the correct location.")
        return
    if not os.path.exists(test_pdf_path):
        print(f"❌ Test PDF file not found: {test_pdf_path}")
        print("Please ensure the test PDF file exists.")
        return

    print("🔍 Testing Google Document AI functionality...")
    print(f"📄 Using PDF: {test_pdf_path}")
    print(f"🔑 Using credentials: {credentials_path}")
    print("-" * 80)

    try:
        # Initialize the Google Document API
        print("1. Initializing Google Document API...")
        doc_api = GoogleDocumentAPI(credentials_path)
        print("✅ Google Document API initialized successfully")

        # Process the document
        print("\n2. Processing document...")
        document = doc_api.process_document(test_pdf_path)
        print("✅ Document processed successfully")

        # Get basic text
        print("\n3. Extracting basic text...")
        basic_text = doc_api.get_document_text(document, page_number=0)
        print(f"📝 Basic text length: {len(basic_text)} characters")
        print(f"📝 First 200 characters: {basic_text[:200]}...")

        # Extract text with bounding boxes and height
        print("\n4. Extracting text with bounding boxes and height...")
        text_blocks = doc_api.extract_text_with_bounding_boxes(document)
        print(f"📊 Found {len(text_blocks)} text blocks")

        # Display sample text blocks
        print("\n5. Sample text blocks with height information:")
        print("-" * 80)
        for number, block in enumerate(text_blocks[:10], start=1):  # Show first 10 blocks
            print(f"Block {number}:")
            print(f" Page: {block['page_number']}")
            print(f" Height: {block['height']:.2f} mm")
            print(f" Style: {block['style']}")
            print(f" Text: {block['text'][:100]}{'...' if len(block['text']) > 100 else ''}")
            print(f" Bounding Box: {block['bounding_box']}")
            print()

        # Generate markdown table
        print("\n6. Generating markdown table...")
        markdown_table = doc_api.extract_text_with_markdown_table(document)
        print("📋 Markdown table generated successfully")

        # Test the new extract_text_heights_mm function
        print("\n7. Testing extract_text_heights_mm function...")
        heights_mm = doc_api.extract_text_heights_mm(document)
        print(f"📏 Found {len(heights_mm)} lines with height in mm")

        # Display sample heights
        print("\n📏 Sample line heights (mm):")
        print("-" * 60)
        for number, (page_num, line_text, height_mm) in enumerate(heights_mm[:10], start=1):
            print(f"Line {number}: Page {page_num}, Height={height_mm}mm | Text: {line_text[:50]}...")

        # Save results to files
        print("\n8. Saving results to files...")
        _save_results(text_blocks, markdown_table, basic_text, heights_mm)

        # Display statistics
        print("\n9. Statistics:")
        print("-" * 30)
        _print_statistics(text_blocks)

        print("\n🎉 Test completed successfully!")
    except Exception as e:
        # Top-level boundary of a demo script: report the failure with its
        # traceback rather than letting it propagate.
        print(f"❌ Error during testing: {e}")
        import traceback
        traceback.print_exc()
def display_markdown_preview(markdown_path="test_results_markdown_table.md"):
    """Display a preview of the generated markdown table.

    Args:
        markdown_path: File to preview. Defaults to the markdown report name
            this script originally hard-coded.

    Returns:
        None. Prints the file contents under a banner, or a hint to run the
        test first when the file does not exist.
    """
    # Keep the try body to the one operation that can raise
    # FileNotFoundError, so nothing else is masked by the handler.
    try:
        with open(markdown_path, "r", encoding="utf-8") as f:
            content = f.read()
    except FileNotFoundError:
        print("❌ Markdown table file not found. Run the test first.")
        return

    print("\n📋 Markdown Table Preview:")
    print("=" * 80)
    print(content)
# Script entry point: runs only when the file is executed directly, not when
# it is imported.
if __name__ == "__main__":
    print("🚀 Google Document AI Test Script")
    print("=" * 50)
    # Run the main test
    test_google_doc_ai()
    # Display markdown preview
    display_markdown_preview()