| import warnings | |
| warnings.filterwarnings(action='ignore') | |
| from langchain_community.document_loaders import PyPDFDirectoryLoader | |
| from langchain_text_splitters import RecursiveCharacterTextSplitter | |
| from dotenv import load_dotenv | |
| load_dotenv() | |
def fetch_document_chunks(
    pdf_folder: str = "./RAG_Documents",
    chunk_size: int = 850,
    chunk_overlap: int = 120,
) -> list:
    """
    Load every PDF in a folder and split the pages into overlapping text chunks.

    This is the document-ingestion step of the RAG pipeline: the returned
    chunks are meant to be embedded and stored in a vector database.

    Parameters
    ----------
    pdf_folder : str, optional
        Directory that is scanned for PDF files. Defaults to
        ``"./RAG_Documents"``.
    chunk_size : int, optional
        Maximum chunk length in characters (measured with ``len``).
        Defaults to 850.
    chunk_overlap : int, optional
        Number of characters shared between consecutive chunks.
        Defaults to 120.

    Returns
    -------
    list[langchain_core.documents.Document]
        Document chunks produced by ``RecursiveCharacterTextSplitter``.
        Each chunk carries:
        - page_content: the text fragment
        - metadata: whatever the PDF loader attached to the page
          (typically the source file and page number) plus ``start_index``,
          the character offset of the chunk inside its source page.

    Raises
    ------
    FileNotFoundError
        If ``pdf_folder`` does not exist or is not a directory.
    ValueError
        If the directory contains no PDF files, or if the loader could not
        produce any document from the files it found.

    Notes
    -----
    - Documents are loaded and split **every time the function is called**;
      nothing is cached. Consider caching the result or using a persistent
      vector store to avoid repeated disk I/O and splitting.
    - Calling the function without arguments reproduces the previous
      hard-coded configuration (``./RAG_Documents``, 850/120).
    """
    # Imported locally so this function does not depend on a module-level
    # import that the file does not currently have.
    from pathlib import Path

    folder = Path(pdf_folder)

    # The directory loader simply globs the path, so a missing folder would
    # otherwise be indistinguishable from an empty result. Fail loudly, as
    # the documented contract promises.
    if not folder.is_dir():
        raise FileNotFoundError(f"PDF directory not found: {folder}")

    # Cheap pre-check so that an empty folder is reported before any PDF
    # parsing machinery is started.
    if not any(entry.is_file() for entry in folder.rglob("*.pdf")):
        raise ValueError(f"No PDF files found in directory: {folder}")

    loader = PyPDFDirectoryLoader(str(pdf_folder))
    docs = loader.load()

    # The loader may skip files (e.g. hidden ones) or yield no pages; an
    # empty result would only surface later as an obscure embedding error.
    if not docs:
        raise ValueError(f"No documents could be loaded from directory: {folder}")

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        # Try paragraph, line and sentence boundaries first; fall back to
        # words and finally single characters.
        separators=["\n\n", "\n", ". ", " ", ""],
        # Record each chunk's character offset for later traceability.
        add_start_index=True,
    )
    return text_splitter.split_documents(docs)