|
|
from pathlib import Path |
|
|
from typing import Optional |
|
|
|
|
|
import weave |
|
|
from langchain_community.document_loaders import UnstructuredPDFLoader |
|
|
from langchain_core.documents import Document |
|
|
|
|
|
|
|
|
class UnstructuredDocumentLoader(weave.Model):
    """Load PDF documents from a directory through ``UnstructuredPDFLoader``.

    The model stores the loader configuration as fields and, on request,
    walks a directory tree, builds one ``UnstructuredPDFLoader`` per PDF file
    with that configuration, and concatenates the parsed documents.
    """

    # Configuration fields; each one is forwarded unchanged to
    # ``UnstructuredPDFLoader`` in ``transform_documents``.
    strategy: str
    mode: str
    include_page_breaks: bool
    infer_table_structure: bool
    ocr_languages: Optional[str]
    languages: Optional[list[str]]
    hi_res_model_name: Optional[str]
    extract_images_in_pdf: bool
    extract_image_block_types: Optional[list[str]]
    extract_image_block_output_dir: Optional[str]
    extract_image_block_to_payload: bool
    starting_page_number: int
    extract_forms: bool
    form_extraction_skip_tables: bool

    def __init__(
        self,
        strategy: str = "hi_res",
        mode: str = "elements",
        include_page_breaks: bool = False,
        infer_table_structure: bool = False,
        ocr_languages: Optional[str] = None,
        languages: Optional[list[str]] = None,
        hi_res_model_name: Optional[str] = None,
        extract_images_in_pdf: bool = False,
        extract_image_block_types: Optional[list[str]] = None,
        extract_image_block_output_dir: Optional[str] = None,
        extract_image_block_to_payload: bool = False,
        starting_page_number: int = 1,
        extract_forms: bool = False,
        form_extraction_skip_tables: bool = True,
    ):
        """Initialize the document loader with configuration parameters.

        Every argument is stored as a model field by forwarding it to the
        parent ``weave.Model`` constructor.

        Args:
            strategy: The strategy for document processing (e.g., "hi_res").
            mode: The mode of extraction (e.g., "elements").
            include_page_breaks: Whether to include page breaks.
            infer_table_structure: Whether to infer table structures.
            ocr_languages: Languages for OCR processing.
            languages: List of languages for document processing.
            hi_res_model_name: Model name for high-resolution processing.
            extract_images_in_pdf: Whether to extract images from PDFs.
            extract_image_block_types: Types of image blocks to extract.
            extract_image_block_output_dir: Directory to save extracted images.
            extract_image_block_to_payload: Whether to add extracted images to
                the payload.
            starting_page_number: Forwarded as ``starting_page_number``.
                NOTE(review): in ``unstructured`` this appears to be the number
                assigned to the first page in element metadata, not a page to
                start extracting from -- confirm against the installed version.
            extract_forms: Whether to extract form data.
            form_extraction_skip_tables: Whether to skip tables during form
                extraction.
        """
        super().__init__(
            strategy=strategy,
            mode=mode,
            include_page_breaks=include_page_breaks,
            infer_table_structure=infer_table_structure,
            ocr_languages=ocr_languages,
            languages=languages,
            hi_res_model_name=hi_res_model_name,
            extract_images_in_pdf=extract_images_in_pdf,
            extract_image_block_types=extract_image_block_types,
            extract_image_block_output_dir=extract_image_block_output_dir,
            extract_image_block_to_payload=extract_image_block_to_payload,
            starting_page_number=starting_page_number,
            extract_forms=extract_forms,
            form_extraction_skip_tables=form_extraction_skip_tables,
        )

    def _get_all_file_paths_from_directory(self, directory_path: str) -> list[str]:
        """Retrieve all file paths from a given directory (recursively).

        Args:
            directory_path: Path to the directory.

        Returns:
            The resolved paths of every regular file below the directory, as
            strings, in sorted order.

        Raises:
            ValueError: If the directory does not exist or is not a directory.
        """
        path = Path(directory_path).resolve()

        if not path.exists():
            msg = f"Directory does not exist: {directory_path}"
            raise ValueError(msg)
        if not path.is_dir():
            msg = f"Path is not a directory: {directory_path}"
            raise ValueError(msg)

        # rglob yields entries in filesystem-dependent order; sort so that the
        # result (and the order of documents built from it) is reproducible.
        return sorted(str(file) for file in path.rglob("*") if file.is_file())

    def transform_documents(self, directory_path: str) -> list[Document]:
        """Transform all PDF documents in the given directory into structured format.

        The directory is searched recursively. Only files whose suffix is
        ``.pdf`` (case-insensitive) are handed to ``UnstructuredPDFLoader``;
        other files are ignored so that stray non-PDF files do not reach the
        PDF loader.

        Args:
            directory_path: Path to the directory containing PDF files.

        Returns:
            The documents returned by each loader's ``load()``, concatenated
            in sorted file-path order. Empty if no PDF file is found.

        Raises:
            ValueError: If the directory does not exist or is not a directory.
        """
        pdf_paths = [
            file
            for file in self._get_all_file_paths_from_directory(directory_path)
            if Path(file).suffix.lower() == ".pdf"
        ]

        # The configuration does not depend on the file, so build it once.
        loader_kwargs = {
            "mode": self.mode,
            "strategy": self.strategy,
            "include_page_breaks": self.include_page_breaks,
            "infer_table_structure": self.infer_table_structure,
            "ocr_languages": self.ocr_languages,
            "languages": self.languages,
            "hi_res_model_name": self.hi_res_model_name,
            "extract_images_in_pdf": self.extract_images_in_pdf,
            "extract_image_block_types": self.extract_image_block_types,
            "extract_image_block_output_dir": self.extract_image_block_output_dir,
            "extract_image_block_to_payload": self.extract_image_block_to_payload,
            "starting_page_number": self.starting_page_number,
            "extract_forms": self.extract_forms,
            "form_extraction_skip_tables": self.form_extraction_skip_tables,
        }

        documents: list[Document] = []
        for file in pdf_paths:
            loader = UnstructuredPDFLoader(file_path=file, **loader_kwargs)
            documents.extend(loader.load())

        return documents
|
|
|