test-ragp/src/rag_pipelines/unstructured/unstructured_pdf_loader.py

Upload 107 files

336f4a9 verified 11 months ago

6.11 kB

	from pathlib import Path
	from typing import Optional

	import weave
	from langchain_community.document_loaders import UnstructuredPDFLoader
	from langchain_core.documents import Document


	class UnstructuredDocumentLoader(weave.Model):
	    """Load every PDF under a directory tree with ``UnstructuredPDFLoader``.

	    The model only stores configuration. Each field below is forwarded,
	    unchanged, as a keyword argument to ``UnstructuredPDFLoader`` for every
	    PDF processed by :meth:`transform_documents`.

	    Attributes:
	        strategy (str): The strategy for document processing (e.g., "hi_res").
	        mode (str): The mode of extraction (e.g., "elements").
	        include_page_breaks (bool): Whether to include page breaks.
	        infer_table_structure (bool): Whether to infer table structures.
	        ocr_languages (Optional[str]): Languages for OCR processing.
	        languages (Optional[list[str]]): List of languages for document processing.
	        hi_res_model_name (Optional[str]): Model name for high-resolution processing.
	        extract_images_in_pdf (bool): Whether to extract images from PDFs.
	        extract_image_block_types (Optional[list[str]]): Types of image blocks to extract.
	        extract_image_block_output_dir (Optional[str]): Directory to save extracted images.
	        extract_image_block_to_payload (bool): Whether to add extracted images to payload.
	        starting_page_number (int): Page-number option passed to the loader.
	            NOTE(review): presumably the number assigned to the first page
	            rather than a page to start extracting from - confirm against
	            the unstructured documentation.
	        extract_forms (bool): Whether to extract form data.
	        form_extraction_skip_tables (bool): Whether to skip tables during form extraction.
	    """

	    strategy: str
	    mode: str
	    include_page_breaks: bool
	    infer_table_structure: bool
	    ocr_languages: Optional[str]
	    languages: Optional[list[str]]
	    hi_res_model_name: Optional[str]
	    extract_images_in_pdf: bool
	    extract_image_block_types: Optional[list[str]]
	    extract_image_block_output_dir: Optional[str]
	    extract_image_block_to_payload: bool
	    starting_page_number: int
	    extract_forms: bool
	    form_extraction_skip_tables: bool

	    def __init__(
	        self,
	        strategy: str = "hi_res",
	        mode: str = "elements",
	        include_page_breaks: bool = False,
	        infer_table_structure: bool = False,
	        ocr_languages: Optional[str] = None,
	        languages: Optional[list[str]] = None,
	        hi_res_model_name: Optional[str] = None,
	        extract_images_in_pdf: bool = False,
	        extract_image_block_types: Optional[list[str]] = None,
	        extract_image_block_output_dir: Optional[str] = None,
	        extract_image_block_to_payload: bool = False,
	        starting_page_number: int = 1,
	        extract_forms: bool = False,
	        form_extraction_skip_tables: bool = True,
	    ):
	        """Initialize the document loader with configuration parameters.

	        All arguments are stored as model fields by delegating to the parent
	        initializer; no file access happens here.

	        Args:
	            strategy (str): The strategy for document processing (e.g., "hi_res").
	            mode (str): The mode of extraction (e.g., "elements").
	            include_page_breaks (bool): Whether to include page breaks.
	            infer_table_structure (bool): Whether to infer table structures.
	            ocr_languages (Optional[str]): Languages for OCR processing.
	            languages (Optional[list[str]]): List of languages for document processing.
	            hi_res_model_name (Optional[str]): Model name for high-resolution processing.
	            extract_images_in_pdf (bool): Whether to extract images from PDFs.
	            extract_image_block_types (Optional[list[str]]): Types of image blocks to extract.
	            extract_image_block_output_dir (Optional[str]): Directory to save extracted images.
	            extract_image_block_to_payload (bool): Whether to add extracted images to payload.
	            starting_page_number (int): Page-number option passed to the loader
	                (presumably the number given to the first page - confirm).
	            extract_forms (bool): Whether to extract form data.
	            form_extraction_skip_tables (bool): Whether to skip tables during form extraction.
	        """
	        super().__init__(
	            strategy=strategy,
	            mode=mode,
	            include_page_breaks=include_page_breaks,
	            infer_table_structure=infer_table_structure,
	            ocr_languages=ocr_languages,
	            languages=languages,
	            hi_res_model_name=hi_res_model_name,
	            extract_images_in_pdf=extract_images_in_pdf,
	            extract_image_block_types=extract_image_block_types,
	            extract_image_block_output_dir=extract_image_block_output_dir,
	            extract_image_block_to_payload=extract_image_block_to_payload,
	            starting_page_number=starting_page_number,
	            extract_forms=extract_forms,
	            form_extraction_skip_tables=form_extraction_skip_tables,
	        )

	    def _get_all_file_paths_from_directory(self, directory_path: str) -> list[str]:
	        """Retrieve all file paths from a given directory (recursively).

	        Args:
	            directory_path (str): Path to the directory.

	        Returns:
	            list[str]: Absolute paths of every regular file below the
	            directory, sorted so the result is deterministic.

	        Raises:
	            ValueError: If the directory does not exist or is not a directory.
	        """
	        path = Path(directory_path).resolve()  # Convert to absolute path

	        if not path.exists():
	            msg = f"Directory does not exist: {directory_path}"
	            raise ValueError(msg)
	        if not path.is_dir():
	            msg = f"Path is not a directory: {directory_path}"
	            raise ValueError(msg)

	        # rglob yields entries in filesystem order, which differs between
	        # machines; sorting keeps downstream document order reproducible.
	        return sorted(str(file) for file in path.rglob("*") if file.is_file())

	    def transform_documents(self, directory_path: str) -> list[Document]:
	        """Transform all PDF documents in the given directory into structured format.

	        The directory is searched recursively. Only files whose suffix is
	        ``.pdf`` (case-insensitive) are parsed; other files are skipped so
	        that stray non-PDF files are not handed to the PDF loader.

	        Args:
	            directory_path (str): Path to the directory containing PDF files.

	        Returns:
	            list[Document]: The documents produced by ``UnstructuredPDFLoader``
	            for each PDF, concatenated in sorted path order. Empty if the
	            directory contains no PDF files.

	        Raises:
	            ValueError: If the directory does not exist or is not a directory.
	        """
	        file_paths = self._get_all_file_paths_from_directory(directory_path)
	        pdf_paths = [
	            file_path
	            for file_path in file_paths
	            if Path(file_path).suffix.lower() == ".pdf"
	        ]

	        # The loader options are identical for every file, so build them once.
	        loader_options = {
	            "mode": self.mode,
	            "strategy": self.strategy,
	            "include_page_breaks": self.include_page_breaks,
	            "infer_table_structure": self.infer_table_structure,
	            "ocr_languages": self.ocr_languages,
	            "languages": self.languages,
	            "hi_res_model_name": self.hi_res_model_name,
	            "extract_images_in_pdf": self.extract_images_in_pdf,
	            "extract_image_block_types": self.extract_image_block_types,
	            "extract_image_block_output_dir": self.extract_image_block_output_dir,
	            "extract_image_block_to_payload": self.extract_image_block_to_payload,
	            "starting_page_number": self.starting_page_number,
	            "extract_forms": self.extract_forms,
	            "form_extraction_skip_tables": self.form_extraction_skip_tables,
	        }

	        documents: list[Document] = []

	        for pdf_path in pdf_paths:
	            loader = UnstructuredPDFLoader(file_path=pdf_path, **loader_options)
	            documents.extend(loader.load())

	        return documents