#!/usr/bin/env python3
"""Test script for Google Document AI functionality.

This script demonstrates the text extraction with bounding boxes and height
calculation.
"""

import os
import sys
import traceback
from collections import Counter
from pathlib import Path

# Directory containing this script; used for the import path and as a
# fallback location for the input files.
_SCRIPT_DIR = Path(__file__).parent

# Add the src directory to the path so we can import our modules
sys.path.append(str(_SCRIPT_DIR / "src"))

from extract_text.google_document_api import GoogleDocumentAPI  # noqa: E402

# Path to the credentials file
_CREDENTIALS_FILE = "src/extract_text/photon-services-f0d3ec1417d0.json"
# Path to a test PDF file
_TEST_PDF_FILE = "requirements_library/client-requirements/Kir-Kat/kitkat-f1.pdf"

# NOTE(review): the symbol prefixes in the printed messages below look like
# mis-decoded emoji. They are reproduced unchanged because the intended
# characters cannot be recovered with certainty -- confirm the file encoding.


def _locate_input(relative_path):
    """Return an existing location for *relative_path*, or ``None``.

    The path is tried as given first (i.e. relative to the working
    directory). If that does not exist, the directory containing this script
    is tried, so the script also works when launched from elsewhere.
    """
    if os.path.exists(relative_path):
        return relative_path
    beside_script = _SCRIPT_DIR / relative_path
    if beside_script.exists():
        return str(beside_script)
    return None


def _print_sample_blocks(text_blocks):
    """Print page, height, style, text preview and box of the first 10 blocks."""
    print("\n5. Sample text blocks with height information:")
    print("-" * 80)
    for index, block in enumerate(text_blocks[:10], start=1):
        text = block['text']
        # Only mark the preview as truncated when text was actually cut.
        ellipsis = '...' if len(text) > 100 else ''
        print(f"Block {index}:")
        print(f" Page: {block['page_number']}")
        print(f" Height: {block['height']:.2f} mm")
        print(f" Style: {block['style']}")
        print(f" Text: {text[:100]}{ellipsis}")
        print(f" Bounding Box: {block['bounding_box']}")
        print()


def _print_sample_heights(heights_mm):
    """Print the first 10 ``(page, text, height)`` entries of *heights_mm*."""
    print("\nšŸ“ Sample line heights (mm):")
    print("-" * 60)
    for index, (page_num, line_text, height_mm) in enumerate(heights_mm[:10], start=1):
        print(f"Line {index}: Page {page_num}, Height={height_mm}mm | Text: {line_text[:50]}...")


def _save_results(text_blocks, markdown_table, basic_text, heights_mm):
    """Write the extraction results to ``test_results_*`` files.

    The files are created relative to the current working directory.
    """
    print("\n8. Saving results to files...")

    # Save raw text blocks
    with open("test_results_text_blocks.txt", "w", encoding="utf-8") as f:
        f.write("Text Blocks with Height Information:\n")
        f.write("=" * 50 + "\n\n")
        for index, block in enumerate(text_blocks, start=1):
            f.write(f"Block {index}:\n")
            f.write(f" Page: {block['page_number']}\n")
            f.write(f" Height: {block['height']:.2f} mm\n")
            f.write(f" Style: {block['style']}\n")
            f.write(f" Text: {block['text']}\n")
            f.write(f" Bounding Box: {block['bounding_box']}\n")
            f.write("-" * 40 + "\n")

    # Save markdown table
    with open("test_results_markdown_table.md", "w", encoding="utf-8") as f:
        f.write("# Google Document AI Results\n\n")
        f.write("## Text Blocks with Height Information\n\n")
        f.write(markdown_table)

    # Save basic text
    with open("test_results_basic_text.txt", "w", encoding="utf-8") as f:
        f.write("Basic Extracted Text:\n")
        f.write("=" * 30 + "\n\n")
        f.write(basic_text)

    print("āœ… Results saved to:")
    print(" - test_results_text_blocks.txt")
    print(" - test_results_markdown_table.md")
    print(" - test_results_basic_text.txt")

    # Save heights data
    with open("test_results_heights_mm.txt", "w", encoding="utf-8") as f:
        f.write("Line Heights in Millimeters:\n")
        f.write("=" * 40 + "\n\n")
        for index, (page_num, line_text, height_mm) in enumerate(heights_mm, start=1):
            f.write(f"Line {index}: Page {page_num}, Height={height_mm}mm\n")
            f.write(f"Text: {line_text}\n")
            f.write("-" * 40 + "\n")

    print(" - test_results_heights_mm.txt")


def _print_statistics(text_blocks):
    """Print min/max/average block height and the per-style block counts."""
    print("\n9. Statistics:")
    print("-" * 30)

    heights = [block['height'] for block in text_blocks]
    # min()/max() and the average are undefined for an empty list.
    if heights:
        print("šŸ“ Height statistics:")
        print(f" Min height: {min(heights):.2f} mm")
        print(f" Max height: {max(heights):.2f} mm")
        print(f" Average height: {sum(heights)/len(heights):.2f} mm")

    # Count styles; most_common() lists them by descending count.
    style_counts = Counter(block['style'] for block in text_blocks)
    print("\nšŸŽØ Style distribution:")
    for style, count in style_counts.most_common():
        print(f" {style}: {count} blocks")


def test_google_doc_ai():
    """Test the Google Document AI functionality with a sample PDF.

    Runs text extraction on the sample PDF, prints samples and statistics,
    and saves the results to ``test_results_*`` files. If the credentials
    file or the PDF cannot be found, a message is printed and the function
    returns early. Any exception raised while processing is printed together
    with its traceback instead of propagating.
    """
    # Check if files exist
    credentials_path = _locate_input(_CREDENTIALS_FILE)
    if credentials_path is None:
        print(f"āŒ Credentials file not found: {_CREDENTIALS_FILE}")
        print("Please ensure the Google Cloud credentials file is in the correct location.")
        return

    test_pdf_path = _locate_input(_TEST_PDF_FILE)
    if test_pdf_path is None:
        print(f"āŒ Test PDF file not found: {_TEST_PDF_FILE}")
        print("Please ensure the test PDF file exists.")
        return

    print("šŸ” Testing Google Document AI functionality...")
    print(f"šŸ“„ Using PDF: {test_pdf_path}")
    print(f"šŸ”‘ Using credentials: {credentials_path}")
    print("-" * 80)

    try:
        # Initialize the Google Document API
        print("1. Initializing Google Document API...")
        doc_api = GoogleDocumentAPI(credentials_path)
        print("āœ… Google Document API initialized successfully")

        # Process the document
        print("\n2. Processing document...")
        document = doc_api.process_document(test_pdf_path)
        print("āœ… Document processed successfully")

        # Get basic text
        print("\n3. Extracting basic text...")
        basic_text = doc_api.get_document_text(document, page_number=0)
        print(f"šŸ“ Basic text length: {len(basic_text)} characters")
        print(f"šŸ“ First 200 characters: {basic_text[:200]}...")

        # Extract text with bounding boxes and height
        print("\n4. Extracting text with bounding boxes and height...")
        text_blocks = doc_api.extract_text_with_bounding_boxes(document)
        print(f"šŸ“Š Found {len(text_blocks)} text blocks")

        # Display sample text blocks
        _print_sample_blocks(text_blocks)

        # Generate markdown table
        print("\n6. Generating markdown table...")
        markdown_table = doc_api.extract_text_with_markdown_table(document)
        print("šŸ“‹ Markdown table generated successfully")

        # Test the new extract_text_heights_mm function
        print("\n7. Testing extract_text_heights_mm function...")
        heights_mm = doc_api.extract_text_heights_mm(document)
        print(f"šŸ“ Found {len(heights_mm)} lines with height in mm")

        # Display sample heights
        _print_sample_heights(heights_mm)

        # Save results to files
        _save_results(text_blocks, markdown_table, basic_text, heights_mm)

        # Display statistics
        _print_statistics(text_blocks)

        print("\nšŸŽ‰ Test completed successfully!")

    except Exception as e:
        # Top-level boundary of a diagnostic script: report and carry on.
        print(f"āŒ Error during testing: {e}")
        traceback.print_exc()


def display_markdown_preview():
    """Display a preview of the generated markdown table.

    Reads ``test_results_markdown_table.md`` from the current working
    directory; prints a hint instead if the file does not exist.
    """
    try:
        with open("test_results_markdown_table.md", "r", encoding="utf-8") as f:
            content = f.read()
    except FileNotFoundError:
        print("āŒ Markdown table file not found. Run the test first.")
        return

    print("\nšŸ“‹ Markdown Table Preview:")
    print("=" * 80)
    print(content)


if __name__ == "__main__":
    print("šŸš€ Google Document AI Test Script")
    print("=" * 50)

    # Run the main test
    test_google_doc_ai()

    # Display markdown preview
    display_markdown_preview()