File size: 7,516 Bytes
6c16992
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
#!/usr/bin/env python3
"""

Test script for Google Document AI functionality.

This script demonstrates the text extraction with bounding boxes and height calculation.

"""

import os
import sys
from pathlib import Path

# Add the src directory to the path so we can import our modules
sys.path.append(str(Path(__file__).parent / "src"))

from extract_text.google_document_api import GoogleDocumentAPI

def _preview(text, limit):
    """Return the first ``limit`` characters of ``text``.

    The ellipsis is appended only when the text was actually cut, so short
    strings are shown unchanged instead of looking truncated.
    """
    if len(text) > limit:
        return text[:limit] + "..."
    return text


def _save_results(text_blocks, markdown_table, basic_text, heights_mm):
    """Write the four result files into the current working directory."""
    # Raw text blocks, one record per block.
    with open("test_results_text_blocks.txt", "w", encoding="utf-8") as f:
        f.write("Text Blocks with Height Information:\n")
        f.write("=" * 50 + "\n\n")
        for number, block in enumerate(text_blocks, start=1):
            f.write(f"Block {number}:\n")
            f.write(f"  Page: {block['page_number']}\n")
            f.write(f"  Height: {block['height']:.2f} mm\n")
            f.write(f"  Style: {block['style']}\n")
            f.write(f"  Text: {block['text']}\n")
            f.write(f"  Bounding Box: {block['bounding_box']}\n")
            f.write("-" * 40 + "\n")

    # Markdown table, prefixed with a small heading.
    with open("test_results_markdown_table.md", "w", encoding="utf-8") as f:
        f.write("# Google Document AI Results\n\n")
        f.write("## Text Blocks with Height Information\n\n")
        f.write(markdown_table)

    # Plain extracted text.
    with open("test_results_basic_text.txt", "w", encoding="utf-8") as f:
        f.write("Basic Extracted Text:\n")
        f.write("=" * 30 + "\n\n")
        f.write(basic_text)

    # Per-line heights.
    with open("test_results_heights_mm.txt", "w", encoding="utf-8") as f:
        f.write("Line Heights in Millimeters:\n")
        f.write("=" * 40 + "\n\n")
        for number, (page_num, line_text, height_mm) in enumerate(heights_mm, start=1):
            f.write(f"Line {number}: Page {page_num}, Height={height_mm}mm\n")
            f.write(f"Text: {line_text}\n")
            f.write("-" * 40 + "\n")


def _print_statistics(text_blocks):
    """Print min/max/average block height and the per-style block counts."""
    from collections import Counter

    heights = [block['height'] for block in text_blocks]
    if heights:  # min()/max() and the average are undefined for no blocks
        print("πŸ“ Height statistics:")
        print(f"   Min height: {min(heights):.2f} mm")
        print(f"   Max height: {max(heights):.2f} mm")
        print(f"   Average height: {sum(heights)/len(heights):.2f} mm")

    # most_common() orders by descending count and keeps first-seen order
    # for ties, which matches a stable reverse sort on the counts.
    style_counts = Counter(block['style'] for block in text_blocks)
    print("\n🎨 Style distribution:")
    for style, count in style_counts.most_common():
        print(f"   {style}: {count} blocks")


def test_google_doc_ai() -> None:
    """Run the Google Document AI extraction steps on a sample PDF.

    The credentials file and the sample PDF are looked up relative to the
    current working directory. If either is missing, a message is printed
    and the function returns without contacting the API.

    Otherwise the function initializes ``GoogleDocumentAPI``, processes the
    PDF, prints samples of the extracted text, blocks and line heights,
    writes four ``test_results_*`` files into the current working directory
    and prints summary statistics.

    Any exception raised during those steps is caught, reported on stdout
    together with its traceback, and not re-raised.
    """
    import traceback

    # Path to the credentials file
    credentials_path = "src/extract_text/photon-services-f0d3ec1417d0.json"

    # Path to a test PDF file
    test_pdf_path = "requirements_library/client-requirements/Kir-Kat/kitkat-f1.pdf"

    # Bail out early when the inputs are missing.
    if not os.path.exists(credentials_path):
        print(f"❌ Credentials file not found: {credentials_path}")
        print("Please ensure the Google Cloud credentials file is in the correct location.")
        return

    if not os.path.exists(test_pdf_path):
        print(f"❌ Test PDF file not found: {test_pdf_path}")
        print("Please ensure the test PDF file exists.")
        return

    print("πŸ” Testing Google Document AI functionality...")
    print(f"πŸ“„ Using PDF: {test_pdf_path}")
    print(f"πŸ”‘ Using credentials: {credentials_path}")
    print("-" * 80)

    try:
        # Initialize the Google Document API
        print("1. Initializing Google Document API...")
        doc_api = GoogleDocumentAPI(credentials_path)
        print("βœ… Google Document API initialized successfully")

        # Process the document
        print("\n2. Processing document...")
        document = doc_api.process_document(test_pdf_path)
        print("βœ… Document processed successfully")

        # Get basic text
        print("\n3. Extracting basic text...")
        basic_text = doc_api.get_document_text(document, page_number=0)
        print(f"πŸ“ Basic text length: {len(basic_text)} characters")
        print(f"πŸ“ First 200 characters: {_preview(basic_text, 200)}")

        # Extract text with bounding boxes and height
        print("\n4. Extracting text with bounding boxes and height...")
        text_blocks = doc_api.extract_text_with_bounding_boxes(document)
        print(f"πŸ“Š Found {len(text_blocks)} text blocks")

        # Display sample text blocks (first 10 only)
        print("\n5. Sample text blocks with height information:")
        print("-" * 80)
        for number, block in enumerate(text_blocks[:10], start=1):
            print(f"Block {number}:")
            print(f"  Page: {block['page_number']}")
            print(f"  Height: {block['height']:.2f} mm")
            print(f"  Style: {block['style']}")
            print(f"  Text: {_preview(block['text'], 100)}")
            print(f"  Bounding Box: {block['bounding_box']}")
            print()

        # Generate markdown table
        print("\n6. Generating markdown table...")
        markdown_table = doc_api.extract_text_with_markdown_table(document)
        print("πŸ“‹ Markdown table generated successfully")

        # Test the extract_text_heights_mm function
        print("\n7. Testing extract_text_heights_mm function...")
        heights_mm = doc_api.extract_text_heights_mm(document)
        print(f"πŸ“ Found {len(heights_mm)} lines with height in mm")

        # Display sample heights (first 10 only)
        print("\nπŸ“ Sample line heights (mm):")
        print("-" * 60)
        for number, (page_num, line_text, height_mm) in enumerate(heights_mm[:10], start=1):
            print(
                f"Line {number}: Page {page_num}, Height={height_mm}mm"
                f" | Text: {_preview(line_text, 50)}"
            )

        # Save results to files; the summary is printed only after every
        # file has actually been written.
        print("\n8. Saving results to files...")
        _save_results(text_blocks, markdown_table, basic_text, heights_mm)
        print("βœ… Results saved to:")
        print("   - test_results_text_blocks.txt")
        print("   - test_results_markdown_table.md")
        print("   - test_results_basic_text.txt")
        print("   - test_results_heights_mm.txt")

        # Display statistics
        print("\n9. Statistics:")
        print("-" * 30)
        _print_statistics(text_blocks)

        print("\nπŸŽ‰ Test completed successfully!")

    except Exception as e:
        # Top-level boundary of the script: report the failure in full
        # rather than letting it escape.
        print(f"❌ Error during testing: {e}")
        traceback.print_exc()

def display_markdown_preview():
    """Print the saved markdown table, or a hint when it does not exist yet.

    Reads ``test_results_markdown_table.md`` from the current working
    directory as UTF-8 text and echoes it below a banner.
    """
    try:
        table_text = Path("test_results_markdown_table.md").read_text(encoding="utf-8")
    except FileNotFoundError:
        print("❌ Markdown table file not found. Run the test first.")
        return

    print("\nπŸ“‹ Markdown Table Preview:")
    print("=" * 80)
    print(table_text)

if __name__ == "__main__":
    # Script entry point: banner first, then the extraction run, then a
    # preview of the markdown table that run is expected to have written.
    banner_rule = "=" * 50
    print("πŸš€ Google Document AI Test Script")
    print(banner_rule)

    for step in (test_google_doc_ai, display_markdown_preview):
        step()