# Spaces: Propelis / QC_Rules (Sleeping)
# File size: 3,087 Bytes
# 6c16992

#!/usr/bin/env python3
"""

Test script for metadata extraction functionality

"""
import os
import sys
from src.extract_text.extract_meta_data import PDFArtworkMetadataExtractor

def _find_first_pdf(base_path):
    """Return the path of the first PDF found under *base_path*, or ``None``.

    Directories and file names are visited in sorted order so the same PDF
    is selected on every run; ``os.walk`` otherwise yields entries in an
    arbitrary, filesystem-dependent order.
    """
    for root, dirs, files in os.walk(base_path):
        # Sorting in place controls the order in which os.walk descends.
        dirs.sort()
        for name in sorted(files):
            if name.lower().endswith('.pdf'):
                return os.path.join(root, name)
    return None


def _print_top_entries(heading, entries, line_template, limit=3):
    """Print *heading* followed by the first *limit* items of *entries*.

    Nothing is printed when *entries* is empty. Items are shown in the
    mapping's own iteration order; *line_template* is a ``str.format``
    template receiving ``rank``, ``key`` and ``count``.
    """
    if not entries:
        return
    print(heading)
    # Zipping against a bounded range stops after `limit` items without
    # copying the whole mapping into a list first.
    for rank, (key, count) in zip(range(1, limit + 1), entries.items()):
        print(line_template.format(rank=rank, key=key, count=count))


def test_metadata_extraction():
    """Run a smoke test of PDF metadata extraction on one sample PDF.

    Looks for a PDF under ``requirements_library/client-requirements``
    (relative to the current working directory), runs
    ``PDFArtworkMetadataExtractor.extract_metadata`` on it and prints a
    summary of the returned metadata.

    Returns:
        bool: ``True`` when extraction finished and the result has no
        ``'error'`` entry. ``False`` when the library folder is missing,
        it contains no PDF, the result contains ``'error'``, or any
        exception was raised during extraction or reporting.
    """
    base_path = "requirements_library/client-requirements"

    if not os.path.exists(base_path):
        print("❌ No requirements library found")
        return False

    pdf_file = _find_first_pdf(base_path)
    if not pdf_file:
        print("❌ No PDF files found in requirements library")
        return False

    print(f"📄 Testing metadata extraction on: {pdf_file}")

    # The broad catch is deliberate: this is the top-level boundary of a
    # smoke test, so any failure is reported and turned into a False result.
    try:
        extractor = PDFArtworkMetadataExtractor()
        metadata = extractor.extract_metadata(pdf_file)

        if 'error' in metadata:
            print(f"❌ Error extracting metadata: {metadata['error']}")
            return False

        print("✅ Metadata extraction successful!")
        print(f"📊 Pages processed: {metadata.get('pages_processed', 0)}")
        print(f"📝 Has selectable text: {metadata.get('has_selectable_text', False)}")
        print(f"🔧 Extraction method: {metadata.get('extraction_method', 'unknown')}")

        # NOTE(review): the "Top 3" labels assume the extractor returns
        # these mappings already ordered by frequency — confirm against
        # the extractor implementation.
        _print_top_entries(
            "\n🔤 Top 3 Fonts:",
            metadata.get('fonts', {}),
            "  {rank}. {key}: {count} characters",
        )
        _print_top_entries(
            "\n📏 Top 3 Font Sizes:",
            metadata.get('font_sizes', {}),
            "  {rank}. {key}pt: {count} characters",
        )
        _print_top_entries(
            "\n🎨 Top 3 Text Colors:",
            metadata.get('text_colors', {}),
            "  {rank}. RGB{key}: {count} characters",
        )

        return True

    except Exception as e:
        print(f"❌ Test failed with error: {str(e)}")
        return False

if __name__ == "__main__":
    # Console banner for a direct script run.
    print("🧪 Testing Metadata Extraction")
    print("=" * 40)

    # Report failure and exit with status 1 so shells/CI can detect it.
    if not test_metadata_extraction():
        print("\n❌ Tests failed. Please check the error messages above.")
        sys.exit(1)

    print("\n✅ All tests passed! Metadata extraction is working correctly.")