File size: 3,087 Bytes
6c16992
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
#!/usr/bin/env python3
"""

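Smoke-test script for the PDF metadata extraction functionality.

Runs PDFArtworkMetadataExtractor on one PDF found under
requirements_library/client-requirements and prints a summary of the result.
When run as a script, exits with status 1 on failure.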

"""
import os
import sys
from src.extract_text.extract_meta_data import PDFArtworkMetadataExtractor

def _find_first_pdf(base_path):
    """Return the path of the first PDF below *base_path*, or ``None``.

    Directories and file names are visited in sorted order so the same file
    is selected on every run; ``os.walk`` itself promises no particular
    ordering. The ``.pdf`` suffix is matched case-insensitively.
    """
    for root, dirs, files in os.walk(base_path):
        # Sorting in place steers the order of os.walk's top-down descent.
        dirs.sort()
        for name in sorted(files):
            if name.lower().endswith('.pdf'):
                return os.path.join(root, name)
    return None


def _print_top(heading, counts, line_format, limit=3):
    """Print *heading* and the *limit* entries of *counts* with the highest values.

    Nothing is printed when *counts* is empty. The sort is stable, so entries
    with equal values keep the order in which the mapping supplied them.
    ``line_format`` is a ``str.format`` template using ``rank``, ``key`` and
    ``count``.
    """
    if not counts:
        return
    # NOTE(review): assumes the values are mutually comparable numbers
    # (character counts) -- confirm against the extractor's output.
    ranked = sorted(counts.items(), key=lambda item: item[1], reverse=True)
    print(heading)
    for rank, (key, count) in enumerate(ranked[:limit], start=1):
        print(line_format.format(rank=rank, key=key, count=count))


def test_metadata_extraction():
    """Run metadata extraction on one sample PDF and print a summary.

    The first PDF found under ``requirements_library/client-requirements``
    (relative to the current working directory) is passed to
    ``PDFArtworkMetadataExtractor.extract_metadata``. On success the page
    count, text/extraction flags and the three most frequent fonts, font
    sizes and text colors are printed.

    Returns:
        bool: ``True`` when extraction succeeded; ``False`` when the library
        directory or a PDF is missing, when the returned metadata contains an
        ``'error'`` key, or when any exception is raised during extraction.
        The script entry point turns this value into the process exit status.
    """
    base_path = "requirements_library/client-requirements"

    if not os.path.exists(base_path):
        print("❌ No requirements library found")
        return False

    pdf_file = _find_first_pdf(base_path)
    if not pdf_file:
        print("❌ No PDF files found in requirements library")
        return False

    print(f"📄 Testing metadata extraction on: {pdf_file}")

    try:
        extractor = PDFArtworkMetadataExtractor()
        metadata = extractor.extract_metadata(pdf_file)

        # The extractor reports failures through an 'error' entry.
        if 'error' in metadata:
            print(f"❌ Error extracting metadata: {metadata['error']}")
            return False

        print("✅ Metadata extraction successful!")
        print(f"📊 Pages processed: {metadata.get('pages_processed', 0)}")
        print(f"📝 Has selectable text: {metadata.get('has_selectable_text', False)}")
        print(f"🔧 Extraction method: {metadata.get('extraction_method', 'unknown')}")

        _print_top(
            "\n🔤 Top 3 Fonts:",
            metadata.get('fonts', {}),
            "  {rank}. {key}: {count} characters",
        )
        _print_top(
            "\n📏 Top 3 Font Sizes:",
            metadata.get('font_sizes', {}),
            "  {rank}. {key}pt: {count} characters",
        )
        _print_top(
            "\n🎨 Top 3 Text Colors:",
            metadata.get('text_colors', {}),
            "  {rank}. RGB{key}: {count} characters",
        )

        return True

    except Exception as e:
        # Deliberately broad: this is the reporting boundary of a smoke test,
        # so any failure is printed and converted into a False result.
        print(f"❌ Test failed with error: {str(e)}")
        return False

if __name__ == "__main__":
    # Script entry point: run the smoke test and report the outcome both on
    # stdout and through the process exit status.
    print("🧪 Testing Metadata Extraction")
    print("=" * 40)

    success = test_metadata_extraction()

    if success:
        print("\n✅ All tests passed! Metadata extraction is working correctly.")
    else:
        print("\n❌ Tests failed. Please check the error messages above.")
        # A non-zero status lets shells and CI detect the failure.
        sys.exit(1)