#!/usr/bin/env python3
"""
Test script for metadata extraction functionality
"""

import os
import sys
from itertools import islice

from src.extract_text.extract_meta_data import PDFArtworkMetadataExtractor


def _find_first_pdf(base_path):
    """Return the path of the first ``.pdf`` file under *base_path*, or None.

    The tree is traversed with ``os.walk``; the extension check is
    case-insensitive.
    """
    for root, _dirs, files in os.walk(base_path):
        for name in files:
            if name.lower().endswith('.pdf'):
                return os.path.join(root, name)
    return None


def _print_top_entries(heading, counts, template, limit=3):
    """Print *heading* and the first *limit* ``key: count`` entries of *counts*.

    Nothing is printed when *counts* is empty. Entries are taken in the
    mapping's own iteration order; no sorting is done here.
    NOTE(review): the "Top 3" headings presumably rely on the extractor
    returning entries already ordered by count -- confirm against the extractor.

    Args:
        heading: Line printed before the entries.
        counts: Mapping of item -> character count.
        template: ``str.format`` template applied to each key (e.g. ``"{}pt"``).
        limit: Maximum number of entries to print.
    """
    if not counts:
        return
    print(heading)
    for rank, (key, count) in enumerate(islice(counts.items(), limit), start=1):
        print(f" {rank}. {template.format(key)}: {count} characters")


def test_metadata_extraction():
    """Test the metadata extraction on a sample PDF.

    Looks for the first PDF under ``requirements_library/client-requirements``,
    runs ``PDFArtworkMetadataExtractor.extract_metadata`` on it and prints a
    summary of the returned metadata.

    Returns:
        bool: True when extraction succeeded; False when the library directory
        or a PDF is missing, the metadata contains an ``'error'`` key, or an
        exception was raised. The ``__main__`` runner below uses this value to
        choose the exit status.
    """
    # Check if we have any PDF files in the requirements library
    base_path = "requirements_library/client-requirements"

    if not os.path.exists(base_path):
        print("❌ No requirements library found")
        return False

    # Find the first PDF file
    pdf_file = _find_first_pdf(base_path)
    if not pdf_file:
        print("❌ No PDF files found in requirements library")
        return False

    print(f"📄 Testing metadata extraction on: {pdf_file}")

    # Broad catch is deliberate: this is the script's top-level boundary and
    # any failure should be reported as a failed test rather than a traceback.
    try:
        # Initialize the extractor
        extractor = PDFArtworkMetadataExtractor()

        # Extract metadata
        metadata = extractor.extract_metadata(pdf_file)

        if 'error' in metadata:
            print(f"❌ Error extracting metadata: {metadata['error']}")
            return False

        # Print results
        print("✅ Metadata extraction successful!")
        print(f"📊 Pages processed: {metadata.get('pages_processed', 0)}")
        print(f"📝 Has selectable text: {metadata.get('has_selectable_text', False)}")
        print(f"🔧 Extraction method: {metadata.get('extraction_method', 'unknown')}")

        # Show top fonts, font sizes and text colors
        _print_top_entries("\n🔤 Top 3 Fonts:", metadata.get('fonts', {}), "{}")
        _print_top_entries("\n📏 Top 3 Font Sizes:", metadata.get('font_sizes', {}), "{}pt")
        _print_top_entries("\n🎨 Top 3 Text Colors:", metadata.get('text_colors', {}), "RGB{}")

        return True

    except Exception as e:
        print(f"❌ Test failed with error: {e}")
        return False


if __name__ == "__main__":
    print("🧪 Testing Metadata Extraction")
    print("=" * 40)

    success = test_metadata_extraction()

    if success:
        print("\n✅ All tests passed! Metadata extraction is working correctly.")
    else:
        print("\n❌ Tests failed. Please check the error messages above.")
        sys.exit(1)