| |
|
"""
Test script for metadata extraction functionality
"""
|
| | import os
|
| | import sys
|
| | from src.extract_text.extract_meta_data import PDFArtworkMetadataExtractor
|
| |
|
def _find_first_pdf(base_path):
    """Return the path of the first ``.pdf`` file found under *base_path*.

    The tree is walked with ``os.walk`` and the extension check is
    case-insensitive. Returns ``None`` when no PDF is found.
    """
    for root, _dirs, files in os.walk(base_path):
        for name in files:
            if name.lower().endswith('.pdf'):
                return os.path.join(root, name)
    return None


def _print_top_entries(heading, entries, prefix="", suffix="", limit=3):
    """Print *heading* followed by the first *limit* items of *entries*.

    Nothing is printed when *entries* is empty. Items are shown in the
    mapping's own iteration order; no sorting is applied here.
    """
    if not entries:
        return
    print(heading)
    for rank, (key, count) in enumerate(list(entries.items())[:limit], start=1):
        print(f" {rank}. {prefix}{key}{suffix}: {count} characters")


def test_metadata_extraction():
    """Test the metadata extraction on a sample PDF.

    Picks the first PDF found under ``requirements_library/client-requirements``,
    runs ``PDFArtworkMetadataExtractor().extract_metadata`` on it and prints a
    summary of the result.

    The outcome is reported through the boolean return value rather than by
    asserting.

    Returns:
        True when extraction succeeded; False when the library directory is
        missing, no PDF is found, the result contains an ``'error'`` key, or
        any exception is raised along the way.
    """
    # NOTE(review): the leading glyphs in the messages below look like
    # mis-decoded emoji; confirm the file's encoding before changing them.
    base_path = "requirements_library/client-requirements"

    if not os.path.exists(base_path):
        print("β No requirements library found")
        return False

    pdf_file = _find_first_pdf(base_path)
    if not pdf_file:
        print("β No PDF files found in requirements library")
        return False

    print(f"π Testing metadata extraction on: {pdf_file}")

    try:
        extractor = PDFArtworkMetadataExtractor()
        metadata = extractor.extract_metadata(pdf_file)

        # The extractor signals failure through an 'error' key in its result.
        if 'error' in metadata:
            print(f"β Error extracting metadata: {metadata['error']}")
            return False

        print("β Metadata extraction successful!")
        print(f"π Pages processed: {metadata.get('pages_processed', 0)}")
        print(f"π Has selectable text: {metadata.get('has_selectable_text', False)}")
        print(f"π§ Extraction method: {metadata.get('extraction_method', 'unknown')}")

        _print_top_entries("\nπ€ Top 3 Fonts:", metadata.get('fonts', {}))
        _print_top_entries(
            "\nπ Top 3 Font Sizes:", metadata.get('font_sizes', {}), suffix="pt"
        )
        _print_top_entries(
            "\nπ¨ Top 3 Text Colors:", metadata.get('text_colors', {}), prefix="RGB"
        )

        return True

    except Exception as e:
        # Deliberately broad: any failure is reported as a failed run
        # instead of a traceback.
        print(f"β Test failed with error: {str(e)}")
        return False
|
| |
|
if __name__ == "__main__":
    # Script entry point: run the check and exit with status 1 on failure.
    # NOTE(review): the leading glyphs in these messages look like
    # mis-decoded emoji; confirm the file's encoding before changing them.
    print("π§ͺ Testing Metadata Extraction")
    print("=" * 40)

    success = test_metadata_extraction()

    if success:
        print("\nβ All tests passed! Metadata extraction is working correctly.")
    else:
        print("\nβ Tests failed. Please check the error messages above.")
        sys.exit(1)