# demo2/src/document_ingestion/document_processor.py
# Author: Dinesh310 — commit c086254 (verified)
"""Document processing module for loading and splitting documents"""
from typing import List, Union
from langchain_community.document_loaders import WebBaseLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
# from langchain.schema import Document
from langchain_core.documents import Document
from pathlib import Path
from langchain_community.document_loaders import (
WebBaseLoader,
PyPDFLoader,
TextLoader,
PyPDFDirectoryLoader
)
class DocumentProcessor:
    """Handles document loading (URL / PDF / TXT) and chunk splitting."""

    def __init__(self, chunk_size: int = 500, chunk_overlap: int = 50):
        """
        Initialize document processor.

        Args:
            chunk_size: Size of text chunks
            chunk_overlap: Overlap between chunks
        """
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap
        )

    def process_pdf(self, file_paths: List[str]) -> List[Document]:
        """
        Load multiple PDFs and return a combined list of chunks.

        Files that fail to load are reported and skipped so one bad file
        does not abort the whole batch.

        Args:
            file_paths: Paths of PDF files to load

        Returns:
            Chunks from every successfully loaded PDF (empty list if none)
        """
        all_documents: List[Document] = []
        for path in file_paths:
            try:
                loader = PyPDFLoader(path)
                # Fix: the splitter attribute set in __init__ is
                # `self.splitter`; `self.text_splitter` never existed, so
                # this method previously raised AttributeError on every
                # file and silently returned [].
                chunks = loader.load_and_split(text_splitter=self.splitter)
                all_documents.extend(chunks)
            except Exception as e:
                # Best-effort batch loading: report and continue.
                print(f"Error loading PDF {path}: {e}")
        return all_documents

    def load_from_url(self, url: str) -> List[Document]:
        """Load document(s) from a URL."""
        loader = WebBaseLoader(url)
        return loader.load()

    def load_from_pdf_dir(self, directory: Union[str, Path]) -> List[Document]:
        """Load documents from all PDFs inside a directory."""
        loader = PyPDFDirectoryLoader(str(directory))
        return loader.load()

    def load_from_txt(self, file_path: Union[str, Path]) -> List[Document]:
        """Load document(s) from a TXT file (decoded as UTF-8)."""
        loader = TextLoader(str(file_path), encoding="utf-8")
        return loader.load()

    def load_from_pdf(self, file_path: Union[str, Path]) -> List[Document]:
        """
        Load document(s) from a single PDF file.

        Fix: previously ignored `file_path` and loaded every PDF in a
        hard-coded "data" directory via PyPDFDirectoryLoader.
        """
        loader = PyPDFLoader(str(file_path))
        return loader.load()

    def load_documents(self, sources: List[str]) -> List[Document]:
        """
        Load documents from URLs, PDF files/directories, or TXT files.

        Args:
            sources: List of URLs, directory paths, .pdf paths, or .txt paths

        Returns:
            List of loaded documents

        Raises:
            ValueError: If a source is none of the supported kinds.
        """
        docs: List[Document] = []
        for src in sources:
            # Fix: URL sources previously fell through into the filesystem
            # branch (no continue), and the filesystem path was hard-coded
            # to "data" instead of using `src`.
            if src.startswith(("http://", "https://")):
                docs.extend(self.load_from_url(src))
                continue
            path = Path(src)
            if path.is_dir():  # directory of PDFs
                docs.extend(self.load_from_pdf_dir(path))
            elif path.suffix.lower() == ".txt":
                docs.extend(self.load_from_txt(path))
            elif path.suffix.lower() == ".pdf":
                docs.extend(self.load_from_pdf(path))
            else:
                raise ValueError(
                    f"Unsupported source type: {src}. "
                    "Use URL, .txt file, or PDF directory."
                )
        return docs

    def split_documents(self, documents: List[Document]) -> List[Document]:
        """
        Split documents into chunks using the configured splitter.

        Args:
            documents: List of documents to split

        Returns:
            List of split documents
        """
        return self.splitter.split_documents(documents)

    def process_urls(self, urls: List[str]) -> List[Document]:
        """
        Complete pipeline to load and split documents.

        Args:
            urls: List of URLs (or other supported sources) to process

        Returns:
            List of processed document chunks
        """
        docs = self.load_documents(urls)
        return self.split_documents(docs)