Spaces:
Runtime error
Runtime error
| """ | |
| PDF document extraction utilities. | |
| """ | |
| from pathlib import Path | |
| from typing import List, Dict, Any, Union | |
| from unstructured.partition.pdf import partition_pdf | |
| from src.config import PDF_EXTRACTION_CONFIG | |
def extract_from_pdf(pdf_path: Union[str, Path]) -> List[Any]:
    """
    Extract content from a PDF file using unstructured.

    Args:
        pdf_path (Union[str, Path]): Path to the PDF file.

    Returns:
        List[Any]: The elements returned by ``partition_pdf`` for this file,
            parsed with the options in ``PDF_EXTRACTION_CONFIG``.

    Raises:
        FileNotFoundError: If ``pdf_path`` does not point to an existing
            regular file (the path is missing or is a directory).
    """
    pdf_path = Path(pdf_path)
    # is_file() rather than exists(): a directory passes an existence check
    # and would only fail later, inside the parser, with an unrelated error.
    if not pdf_path.is_file():
        raise FileNotFoundError(f"PDF file not found: {pdf_path}")
    # partition_pdf documents `filename` as a string, so pass a plain str
    # instead of relying on it accepting Path objects.
    return partition_pdf(filename=str(pdf_path), **PDF_EXTRACTION_CONFIG)
def separate_content_types(chunks: List[Any]) -> Dict[str, List[Any]]:
    """
    Separate the extracted content into text, images, and tables.

    Elements are classified by their class name (``'Table'``, ``'Image'``,
    ``'CompositeElement'``). A ``CompositeElement`` is kept as a text chunk,
    and any tables or images found in its ``metadata.orig_elements`` are
    collected as well. Elements of any other class are ignored.

    Args:
        chunks (List[Any]): List of extracted elements from the PDF

    Returns:
        Dict[str, List[Any]]: Dictionary with keys 'texts', 'images', 'tables'
    """
    texts: List[Any] = []
    images: List[Any] = []
    tables: List[Any] = []

    def _collect_media(element: Any) -> bool:
        """File a table or image into its bucket; return False for anything else."""
        kind = type(element).__name__
        if kind == 'Table':
            tables.append(element)
        elif kind == 'Image':
            images.append(element)
        else:
            return False
        return True

    for chunk in chunks:
        if _collect_media(chunk):
            continue
        if type(chunk).__name__ != 'CompositeElement':
            continue
        texts.append(chunk)
        # orig_elements can be absent or None (e.g. when the chunker was told
        # not to keep the original elements); treat that as "nothing nested"
        # instead of failing on iteration.
        metadata = getattr(chunk, 'metadata', None)
        for element in getattr(metadata, 'orig_elements', None) or ():
            _collect_media(element)

    return {'texts': texts, 'images': images, 'tables': tables}