import warnings
warnings.filterwarnings(action='ignore')
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from dotenv import load_dotenv
load_dotenv()
def fetch_document_chunks(pdf_folder="./RAG_Documents",
                          chunk_size=850,
                          chunk_overlap=120):
    """
    Load and split all PDF files from *pdf_folder* into manageable text chunks.

    This function serves as the document ingestion step for the RAG pipeline.
    It:
    - Loads every PDF file found in the source directory
    - Splits documents into overlapping chunks optimized for vector embedding
      and retrieval in graphology/handwriting analysis context

    Parameters
    ----------
    pdf_folder : str, optional
        Directory containing the source PDFs (default: "./RAG_Documents").
    chunk_size : int, optional
        Maximum chunk length in characters (default: 850).
    chunk_overlap : int, optional
        Overlap between consecutive chunks in characters (default: 120).

    Returns
    -------
    list[langchain_core.documents.Document]
        List of document chunks ready to be embedded and stored in a vector
        database. Each chunk contains:
        - page_content: the text fragment
        - metadata: source file, page number, start_index
          (start_index is included for traceability via add_start_index=True)

    Raises
    ------
    FileNotFoundError
        If *pdf_folder* does not exist or is not a directory.
    ValueError
        If no PDF documents could be loaded from *pdf_folder*.

    Notes
    -----
    - This function loads and splits documents **every time it is called**.
    - In production, consider caching the chunks or using a persistent vector
      store to avoid repeated disk I/O and splitting.
    - The default parameters (850/120) are reasonable for most
      sentence-transformers models and graphology-related documents.
    """
    # Local import keeps the module's top-level dependency block untouched.
    from pathlib import Path

    # Validate up front so the documented FileNotFoundError contract holds;
    # PyPDFDirectoryLoader alone would silently return an empty list.
    folder = Path(pdf_folder)
    if not folder.is_dir():
        raise FileNotFoundError(
            f"PDF source directory not found: {pdf_folder}"
        )

    loader = PyPDFDirectoryLoader(pdf_folder)
    docs = loader.load()
    # Enforce the documented ValueError contract: an empty result means the
    # directory held no loadable PDFs, which downstream embedding can't use.
    if not docs:
        raise ValueError(
            f"No PDF documents could be loaded from: {pdf_folder}"
        )

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        # Prefer paragraph, then line, then sentence boundaries before
        # falling back to word/character splits.
        separators=["\n\n", "\n", ". ", " ", ""],
        add_start_index=True,
    )
    return text_splitter.split_documents(docs)