# Try/src/agenticRAG/components/document_parsing.py
# Uploaded by Alamgirapi ("Upload folder src", commit b325aad, verified)
import os
from typing import List, Union
from pathlib import Path
# LangChain imports
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import (
PyPDFLoader,
Docx2txtLoader,
TextLoader,
UnstructuredMarkdownLoader
)
from langchain.schema import Document
class DocumentChunker:
    """
    Read various document types and chunk them using LangChain.

    Supported extensions are ``.pdf``, ``.docx``, ``.txt`` and ``.md``.
    Problems (missing file, unsupported type, loader failure) are reported
    with ``print`` and yield an empty list instead of raising.
    """

    def __init__(self, chunk_size: int = 1000, chunk_overlap: int = 200):
        """
        Initialize the DocumentChunker.

        Args:
            chunk_size (int): Size of each chunk in characters
            chunk_overlap (int): Number of characters to overlap between chunks
        """
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            length_function=len,
            separators=["\n\n", "\n", " ", ""],
        )

    def _run_loader(self, make_loader, label: str, file_path: str) -> List[Document]:
        """
        Build a loader, load its documents, and report any failure.

        Args:
            make_loader: Zero-argument callable returning a loader object
                that exposes ``load()``
            label (str): File-type label used in the error message
            file_path (str): Path shown in the error message
        Returns:
            List[Document]: Loaded documents, or an empty list on any error
        """
        try:
            # The loader is constructed inside the try block so that errors
            # raised by the constructor are reported the same way as errors
            # raised by load().
            return make_loader().load()
        except Exception as e:
            print(f"Error reading {label} file {file_path}: {e}")
            return []

    def read_pdf(self, file_path: str) -> List[Document]:
        """Read PDF file and return documents (empty list on error)"""
        return self._run_loader(lambda: PyPDFLoader(file_path), "PDF", file_path)

    def read_docx(self, file_path: str) -> List[Document]:
        """Read DOCX file and return documents (empty list on error)"""
        return self._run_loader(lambda: Docx2txtLoader(file_path), "DOCX", file_path)

    def read_txt(self, file_path: str) -> List[Document]:
        """Read TXT file as UTF-8 and return documents (empty list on error)"""
        return self._run_loader(
            lambda: TextLoader(file_path, encoding='utf-8'), "TXT", file_path
        )

    def read_md(self, file_path: str) -> List[Document]:
        """Read Markdown file and return documents (empty list on error)"""
        return self._run_loader(
            lambda: UnstructuredMarkdownLoader(file_path), "MD", file_path
        )

    def load_document(self, file_path: str) -> List[Document]:
        """
        Load document based on file extension.

        The extension is compared case-insensitively.

        Args:
            file_path (str): Path to the document file
        Returns:
            List[Document]: List of loaded documents; empty if the extension
                is unsupported or the reader failed
        """
        file_extension = Path(file_path).suffix.lower()
        # Extension -> reader dispatch table; add new formats here.
        readers = {
            '.pdf': self.read_pdf,
            '.docx': self.read_docx,
            '.txt': self.read_txt,
            '.md': self.read_md,
        }
        reader = readers.get(file_extension)
        if reader is None:
            print(f"Unsupported file type: {file_extension}")
            return []
        return reader(file_path)

    def chunk_documents(self, documents: List[Document]) -> List[str]:
        """
        Chunk documents and return list of strings.

        Only the text of each chunk is returned; chunk metadata is dropped.

        Args:
            documents (List[Document]): List of documents to chunk
        Returns:
            List[str]: List of chunked text strings
        """
        if not documents:
            return []
        chunks = self.text_splitter.split_documents(documents)
        return [chunk.page_content for chunk in chunks]

    def process_file(self, file_path: str) -> List[str]:
        """
        Process a single file: load and chunk it.

        Args:
            file_path (str): Path to the file to process
        Returns:
            List[str]: List of chunked text strings; empty if the file is
                missing or nothing could be loaded from it
        """
        if not os.path.exists(file_path):
            print(f"File not found: {file_path}")
            return []

        documents = self.load_document(file_path)
        if not documents:
            print(f"No content loaded from {file_path}")
            return []

        chunks = self.chunk_documents(documents)
        print(f"Successfully processed {file_path}: {len(chunks)} chunks created")
        return chunks

    def process_multiple_files(self, file_paths: Union[str, List[str]]) -> List[str]:
        """
        Process multiple files and return combined chunks.

        Args:
            file_paths (Union[str, List[str]]): File paths to process. A single
                path is also accepted and handled as a one-element list.
        Returns:
            List[str]: Combined list of chunked text strings, in input order
        """
        # A bare string is itself iterable; without this guard it would be
        # walked character by character, each character treated as a path.
        if isinstance(file_paths, (str, os.PathLike)):
            file_paths = [file_paths]

        all_chunks: List[str] = []
        for file_path in file_paths:
            all_chunks.extend(self.process_file(file_path))
        return all_chunks
# Example usage and utility functions
def main():
    """
    Demonstrate DocumentChunker on a single file and then on a batch of files.

    Returns:
        List[str]: Combined chunks produced by the batch run
    """
    demo_chunker = DocumentChunker(chunk_size=800, chunk_overlap=100)

    # Single-file run; replace the placeholder path with a real document.
    single_file_chunks = demo_chunker.process_file("example.pdf")
    if single_file_chunks:
        first_chunk = single_file_chunks[0]
        print(f"Total chunks: {len(single_file_chunks)}")
        print("\nFirst chunk preview:")
        # Show at most 200 characters, marking truncation with an ellipsis.
        if len(first_chunk) > 200:
            print(first_chunk[:200] + "...")
        else:
            print(first_chunk)

    # Batch run: one placeholder path per supported file type.
    combined_chunks = demo_chunker.process_multiple_files(
        [
            "document1.pdf",
            "document2.docx",
            "document3.txt",
            "document4.md",
        ]
    )
    print(f"\nTotal chunks from all files: {len(combined_chunks)}")
    return combined_chunks
def create_chunker_with_custom_settings(chunk_size: int = 1000,
                                        chunk_overlap: int = 200) -> DocumentChunker:
    """
    Factory helper that builds a DocumentChunker from the given settings.

    Args:
        chunk_size (int): Size of each chunk
        chunk_overlap (int): Overlap between chunks
    Returns:
        DocumentChunker: New chunker configured with these values
    """
    settings = {"chunk_size": chunk_size, "chunk_overlap": chunk_overlap}
    configured_chunker = DocumentChunker(**settings)
    return configured_chunker
# Run the example only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()