import warnings
warnings.filterwarnings(action='ignore')
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from dotenv import load_dotenv
load_dotenv()
def fetch_document_chunks(pdf_folder="./RAG_Documents",
                          chunk_size=850,
                          chunk_overlap=120):
    """
    Load and split all PDF files from *pdf_folder* into manageable text chunks.

    This function serves as the document ingestion step for the RAG pipeline.
    It:
    - Loads every PDF file found in the source directory
    - Splits documents into overlapping chunks optimized for vector embedding
      and retrieval in graphology/handwriting analysis context

    Parameters
    ----------
    pdf_folder : str, optional
        Directory containing the source PDFs (default: "./RAG_Documents").
    chunk_size : int, optional
        Maximum chunk length in characters (default: 850).
    chunk_overlap : int, optional
        Overlap between consecutive chunks in characters (default: 120).

    Returns
    -------
    list[langchain_core.documents.Document]
        List of document chunks ready to be embedded and stored in a vector
        database. Each chunk contains:
        - page_content: the text fragment
        - metadata: source file, page number, start_index
          (start_index is included for traceability via add_start_index=True)

    Raises
    ------
    FileNotFoundError
        If *pdf_folder* does not exist or is not a directory.
    ValueError
        If no PDF documents could be loaded from *pdf_folder*.

    Notes
    -----
    - This function loads and splits documents **every time it is called**.
    - In production, consider caching the chunks or using a persistent vector
      store to avoid repeated disk I/O and splitting.
    - The default parameters (850/120) are reasonable for most
      sentence-transformers models and graphology-related documents.
    """
    # Local import keeps the module's top-level dependency block untouched.
    from pathlib import Path

    # Validate up front so the documented FileNotFoundError contract holds;
    # PyPDFDirectoryLoader alone would silently return an empty list.
    folder = Path(pdf_folder)
    if not folder.is_dir():
        raise FileNotFoundError(
            f"PDF source directory not found: {pdf_folder}"
        )

    loader = PyPDFDirectoryLoader(pdf_folder)
    docs = loader.load()
    # Enforce the documented ValueError contract: an empty result means the
    # directory held no loadable PDFs, which downstream embedding can't use.
    if not docs:
        raise ValueError(
            f"No PDF documents could be loaded from: {pdf_folder}"
        )

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        # Prefer paragraph, then line, then sentence boundaries before
        # falling back to word/character splits.
        separators=["\n\n", "\n", ". ", " ", ""],
        add_start_index=True,
    )
    return text_splitter.split_documents(docs)