YuITC
Add application file
c8e875f
"""
PDF document extraction utilities.
"""
from pathlib import Path
from typing import List, Dict, Any, Union
from unstructured.partition.pdf import partition_pdf
from src.config import PDF_EXTRACTION_CONFIG
def extract_from_pdf(pdf_path: Union[str, Path]) -> List[Any]:
    """
    Extract content from a PDF file using unstructured.

    Args:
        pdf_path (Union[str, Path]): Path to the PDF file

    Returns:
        List[Any]: List of extracted elements (text, tables, images), as
            returned by ``partition_pdf`` with ``PDF_EXTRACTION_CONFIG``

    Raises:
        FileNotFoundError: If ``pdf_path`` does not point to an existing
            regular file (missing paths and directories are both rejected)
    """
    # Path(Path) is a no-op, so no isinstance branch is needed.
    pdf_path = Path(pdf_path)

    # is_file() rather than exists(): a directory cannot be parsed as a PDF,
    # so fail here with a clear message instead of inside the partitioner.
    if not pdf_path.is_file():
        raise FileNotFoundError(f"PDF file not found: {pdf_path}")

    # Hand the partitioner a plain string; its `filename` parameter is
    # documented as a str, not a path object.
    return partition_pdf(filename=str(pdf_path), **PDF_EXTRACTION_CONFIG)
def separate_content_types(chunks: List[Any]) -> Dict[str, List[Any]]:
    """
    Separate the extracted content into text, images, and tables.

    Elements are classified by their class name. A top-level ``Table`` or
    ``Image`` goes into the matching list. A ``CompositeElement`` is kept as
    a text chunk, and any ``Image`` / ``Table`` found in its
    ``metadata.orig_elements`` is additionally collected. Elements of any
    other class are ignored.

    Args:
        chunks (List[Any]): List of extracted elements from the PDF

    Returns:
        Dict[str, List[Any]]: Dictionary with keys 'texts', 'images', 'tables'
    """
    texts: List[Any] = []
    images: List[Any] = []
    tables: List[Any] = []

    # Class name -> destination list for elements that are routed directly.
    direct_targets = {'Table': tables, 'Image': images}

    for chunk in chunks:
        kind = type(chunk).__name__

        if kind in direct_targets:
            direct_targets[kind].append(chunk)
            continue

        if kind != 'CompositeElement':
            continue

        texts.append(chunk)

        # orig_elements may be absent or None when the composite carries no
        # original elements; treat that as "nothing nested" instead of
        # failing on iteration.
        metadata = getattr(chunk, 'metadata', None)
        nested = getattr(metadata, 'orig_elements', None) or ()
        for element in nested:
            target = direct_targets.get(type(element).__name__)
            if target is not None:
                target.append(element)

    return {'texts': texts, 'images': images, 'tables': tables}