from pathlib import Path
from typing import Optional

import weave
from langchain_community.document_loaders import UnstructuredPDFLoader
from langchain_core.documents import Document


class UnstructuredDocumentLoader(weave.Model):
    """Load every PDF found under a directory with ``UnstructuredPDFLoader``.

    The model stores the loader configuration as fields and forwards each of
    them unchanged to ``UnstructuredPDFLoader`` for every PDF it processes.
    """

    # Loader configuration; each field is passed as the keyword argument of
    # the same name to UnstructuredPDFLoader in ``transform_documents``.
    strategy: str
    mode: str
    include_page_breaks: bool
    infer_table_structure: bool
    ocr_languages: Optional[str]
    languages: Optional[list[str]]
    hi_res_model_name: Optional[str]
    extract_images_in_pdf: bool
    extract_image_block_types: Optional[list[str]]
    extract_image_block_output_dir: Optional[str]
    extract_image_block_to_payload: bool
    starting_page_number: int
    extract_forms: bool
    form_extraction_skip_tables: bool

    def __init__(
        self,
        strategy: str = "hi_res",
        mode: str = "elements",
        include_page_breaks: bool = False,
        infer_table_structure: bool = False,
        ocr_languages: Optional[str] = None,
        languages: Optional[list[str]] = None,
        hi_res_model_name: Optional[str] = None,
        extract_images_in_pdf: bool = False,
        extract_image_block_types: Optional[list[str]] = None,
        extract_image_block_output_dir: Optional[str] = None,
        extract_image_block_to_payload: bool = False,
        starting_page_number: int = 1,
        extract_forms: bool = False,
        form_extraction_skip_tables: bool = True,
    ):
        """Initialize the document loader with configuration parameters.

        The fields are declared without defaults, so the defaults live in this
        signature and every value is handed to the base-class initializer.

        Args:
            strategy: The strategy for document processing (e.g., "hi_res").
            mode: The mode of extraction (e.g., "elements").
            include_page_breaks: Whether to include page breaks.
            infer_table_structure: Whether to infer table structures.
            ocr_languages: Languages for OCR processing.
                NOTE(review): recent unstructured releases appear to prefer
                ``languages`` over this option -- confirm before relying on it.
            languages: List of languages for document processing.
            hi_res_model_name: Model name for high-resolution processing.
            extract_images_in_pdf: Whether to extract images from PDFs.
            extract_image_block_types: Types of image blocks to extract.
            extract_image_block_output_dir: Directory to save extracted images.
            extract_image_block_to_payload: Whether to add extracted images to
                the payload.
            starting_page_number: Forwarded to the loader unchanged.
                NOTE(review): in unstructured this looks like the number
                assigned to the first page in element metadata rather than a
                page to start extraction from -- confirm against the installed
                version.
            extract_forms: Whether to extract form data.
            form_extraction_skip_tables: Whether to skip tables during form
                extraction.
        """
        super().__init__(
            strategy=strategy,
            mode=mode,
            include_page_breaks=include_page_breaks,
            infer_table_structure=infer_table_structure,
            ocr_languages=ocr_languages,
            languages=languages,
            hi_res_model_name=hi_res_model_name,
            extract_images_in_pdf=extract_images_in_pdf,
            extract_image_block_types=extract_image_block_types,
            extract_image_block_output_dir=extract_image_block_output_dir,
            extract_image_block_to_payload=extract_image_block_to_payload,
            starting_page_number=starting_page_number,
            extract_forms=extract_forms,
            form_extraction_skip_tables=form_extraction_skip_tables,
        )

    def _get_all_file_paths_from_directory(self, directory_path: str) -> list[str]:
        """Retrieve all file paths from a given directory (recursively).

        Args:
            directory_path: Path to the directory.

        Returns:
            Absolute paths of every regular file below the directory, sorted
            so that the result does not depend on filesystem iteration order.

        Raises:
            ValueError: If the directory does not exist or is not a directory.
        """
        path = Path(directory_path).resolve()  # Convert to absolute path

        if not path.exists():
            msg = f"Directory does not exist: {directory_path}"
            raise ValueError(msg)
        if not path.is_dir():
            msg = f"Path is not a directory: {directory_path}"
            raise ValueError(msg)

        # rglob yields entries in filesystem order; sort for reproducibility.
        return sorted(str(file) for file in path.rglob("*") if file.is_file())

    def transform_documents(self, directory_path: str) -> list[Document]:
        """Transform all PDFs in the given directory into structured documents.

        The directory is searched recursively. Only files whose suffix is
        ``.pdf`` (case-insensitive) are loaded; other files are skipped rather
        than being handed to the PDF loader.

        Args:
            directory_path: Path to the directory containing PDF files.

        Returns:
            The documents produced by ``UnstructuredPDFLoader.load()`` for
            each PDF, concatenated in sorted path order. Empty when the
            directory contains no PDF files.

        Raises:
            ValueError: If the directory does not exist or is not a directory.
        """
        pdf_paths = [
            file_path
            for file_path in self._get_all_file_paths_from_directory(directory_path)
            if Path(file_path).suffix.lower() == ".pdf"
        ]

        documents: list[Document] = []

        for pdf_path in pdf_paths:
            loader = UnstructuredPDFLoader(
                file_path=pdf_path,
                mode=self.mode,
                strategy=self.strategy,
                include_page_breaks=self.include_page_breaks,
                infer_table_structure=self.infer_table_structure,
                ocr_languages=self.ocr_languages,
                languages=self.languages,
                hi_res_model_name=self.hi_res_model_name,
                extract_images_in_pdf=self.extract_images_in_pdf,
                extract_image_block_types=self.extract_image_block_types,
                extract_image_block_output_dir=self.extract_image_block_output_dir,
                extract_image_block_to_payload=self.extract_image_block_to_payload,
                starting_page_number=self.starting_page_number,
                extract_forms=self.extract_forms,
                form_extraction_skip_tables=self.form_extraction_skip_tables,
            )
            documents.extend(loader.load())

        return documents