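#!/usr/bin/env python3
# File: QC_Rules / test_metadata.py
# Page-header residue from the file's web listing, kept as comments so the
# module parses: "Jakecole1's picture", "Upload 11 files", "6c16992 verified".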
"""
Test script for metadata extraction functionality
"""
import os
import sys
from src.extract_text.extract_meta_data import PDFArtworkMetadataExtractor
def _find_first_pdf(base_path):
    """Return the path of the first ``.pdf`` file under *base_path*, or ``None``.

    Directories and file names are visited in sorted order so the same sample
    file is picked on every run; ``os.walk`` on its own yields entries in
    whatever order the filesystem reports them.
    """
    for root, dirs, files in os.walk(base_path):
        # Sorting in place also fixes the order in which os.walk descends.
        dirs.sort()
        for name in sorted(files):
            if name.lower().endswith('.pdf'):
                return os.path.join(root, name)
    return None


def _print_leading_entries(heading, entries, prefix="", suffix="", limit=3):
    """Print *heading* and the first *limit* ``key: count`` pairs of *entries*.

    Nothing is printed when *entries* is empty. *prefix* and *suffix* are
    wrapped around each key (e.g. ``RGB`` before a colour, ``pt`` after a
    size).

    NOTE(review): entries are shown in the mapping's own iteration order; the
    "Top 3" headings presume the extractor already returns them ranked by
    count -- confirm against the extractor.
    """
    if not entries:
        return
    print(heading)
    for rank, (key, count) in enumerate(list(entries.items())[:limit], start=1):
        print(f" {rank}. {prefix}{key}{suffix}: {count} characters")


def test_metadata_extraction():
    """Run the metadata extractor on one sample PDF and print a short report.

    Looks for ``requirements_library/client-requirements`` relative to the
    current working directory, picks the first ``.pdf`` file found beneath it
    (sorted traversal order), calls
    ``PDFArtworkMetadataExtractor().extract_metadata`` on it and prints the
    page count, text/extraction flags and the leading fonts, font sizes and
    text colours from the result.

    Returns:
        bool: ``True`` when extraction finished and the result has no
        ``'error'`` key. ``False`` when the library directory is missing, no
        PDF is found, the result contains ``'error'``, or any exception is
        raised while extracting or reporting.
    """
    base_path = "requirements_library/client-requirements"
    if not os.path.exists(base_path):
        print("❌ No requirements library found")
        return False

    pdf_file = _find_first_pdf(base_path)
    if not pdf_file:
        print("❌ No PDF files found in requirements library")
        return False

    print(f"📄 Testing metadata extraction on: {pdf_file}")

    # The broad catch is deliberate: this is the script's top-level boundary,
    # and every failure is reported to the caller as a failed run.
    try:
        extractor = PDFArtworkMetadataExtractor()
        metadata = extractor.extract_metadata(pdf_file)

        if 'error' in metadata:
            print(f"❌ Error extracting metadata: {metadata['error']}")
            return False

        print("✅ Metadata extraction successful!")
        print(f"📊 Pages processed: {metadata.get('pages_processed', 0)}")
        print(f"📝 Has selectable text: {metadata.get('has_selectable_text', False)}")
        print(f"🔧 Extraction method: {metadata.get('extraction_method', 'unknown')}")

        _print_leading_entries("\n🔤 Top 3 Fonts:", metadata.get('fonts', {}))
        _print_leading_entries(
            "\n📏 Top 3 Font Sizes:", metadata.get('font_sizes', {}), suffix="pt"
        )
        _print_leading_entries(
            "\n🎨 Top 3 Text Colors:", metadata.get('text_colors', {}), prefix="RGB"
        )
        return True
    except Exception as e:
        print(f"❌ Test failed with error: {e}")
        return False


# The ``test_`` prefix would make pytest collect this function, but it reports
# its outcome through a bool return value (which the ``__main__`` runner relies
# on) rather than by raising. pytest warns about, or in newer releases fails,
# any test that returns a non-None value, so opt out of collection and keep
# this a plain script entry point.
test_metadata_extraction.__test__ = False
if __name__ == "__main__":
    # Script entry point: run the extraction check, print a banner and a
    # final verdict, and signal failure through the process exit status.
    print("🧪 Testing Metadata Extraction")
    print("=" * 40)

    success = test_metadata_extraction()

    if success:
        print("\n✅ All tests passed! Metadata extraction is working correctly.")
    else:
        print("\n❌ Tests failed. Please check the error messages above.")
        # Non-zero status so shells and CI jobs see the failure.
        sys.exit(1)