File size: 1,740 Bytes
c8e875f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
"""
PDF document extraction utilities.
"""
from pathlib                    import Path
from typing                     import List, Dict, Any, Union
from unstructured.partition.pdf import partition_pdf

from src.config import PDF_EXTRACTION_CONFIG


def extract_from_pdf(pdf_path: Union[str, Path]) -> List[Any]:
    """
    Extract content from a PDF file using unstructured.

    The path is validated up front so that a missing file (or a path that
    points at a directory) fails immediately with a clear error instead of
    surfacing as an unrelated exception from inside the partitioner.

    Args:
        pdf_path (Union[str, Path]): Path to the PDF file

    Returns:
        List[Any]: Elements returned by ``partition_pdf`` when called with
            the keyword arguments in ``PDF_EXTRACTION_CONFIG``

    Raises:
        FileNotFoundError: If ``pdf_path`` does not point to an existing
            regular file
    """
    # Path() is a no-op for Path instances and also accepts any other
    # path-like object, so no isinstance() branching is needed.
    pdf_path = Path(pdf_path)

    # is_file() (rather than exists()) also rejects directories.
    if not pdf_path.is_file():
        raise FileNotFoundError(f"PDF file not found: {pdf_path}")

    # Hand the partitioner a plain string: `filename` is documented as a
    # str, so do not rely on it tolerating Path objects.
    return partition_pdf(filename=str(pdf_path), **PDF_EXTRACTION_CONFIG)


def separate_content_types(chunks: List[Any]) -> Dict[str, List[Any]]:
    """
    Separate the extracted content into text, images, and tables.

    Elements are classified by their class name. Top-level ``Table`` and
    ``Image`` elements go straight into their lists. Each
    ``CompositeElement`` is kept as a text chunk, and any ``Image`` or
    ``Table`` found in its ``metadata.orig_elements`` is collected as well.
    Elements of any other class are ignored.

    Args:
        chunks (List[Any]): List of extracted elements from the PDF

    Returns:
        Dict[str, List[Any]]: Dictionary with keys 'texts', 'images', 'tables',
            each list in encounter order
    """
    texts, images, tables = [], [], []

    # Matching on the class name (not isinstance) keeps this function free
    # of imports for the element classes.
    buckets = {'Table': tables, 'Image': images}

    for chunk in chunks:
        kind = type(chunk).__name__

        if kind in buckets:
            buckets[kind].append(chunk)
            continue

        if kind != 'CompositeElement':
            continue

        texts.append(chunk)

        # orig_elements may be None or absent; treat that as "no nested
        # elements" instead of raising TypeError on iteration.
        metadata = getattr(chunk, 'metadata', None)
        nested = getattr(metadata, 'orig_elements', None) or ()
        for element in nested:
            bucket = buckets.get(type(element).__name__)
            if bucket is not None:
                bucket.append(element)

    return {'texts': texts, 'images': images, 'tables': tables}