"""Document processing module for loading and splitting documents""" from typing import List, Union from langchain_community.document_loaders import WebBaseLoader from langchain_text_splitters import RecursiveCharacterTextSplitter # from langchain.schema import Document from langchain_core.documents import Document from pathlib import Path from langchain_community.document_loaders import ( WebBaseLoader, PyPDFLoader, TextLoader, PyPDFDirectoryLoader ) class DocumentProcessor: """Handles document loading and processing""" def __init__(self, chunk_size: int = 500, chunk_overlap: int = 50): """ Initialize document processor Args: chunk_size: Size of text chunks chunk_overlap: Overlap between chunks """ self.chunk_size = chunk_size self.chunk_overlap = chunk_overlap self.splitter = RecursiveCharacterTextSplitter( chunk_size=chunk_size, chunk_overlap=chunk_overlap ) def process_pdf(self, file_paths: List[str]): """Load multiple PDFs and return a combined list of chunks""" all_documents = [] for path in file_paths: try: loader = PyPDFLoader(path) # This splits the specific PDF into chunks chunks = loader.load_and_split(text_splitter=self.text_splitter) # We add those chunks to our master list all_documents.extend(chunks) except Exception as e: print(f"Error loading PDF {path}: {e}") return all_documents # def process_pdf(self, file_path: str): # """Load a PDF from a file path and split into chunks""" # try: # loader = PyPDFLoader(file_path) # # Load and split in one go # documents = loader.load_and_split(text_splitter=self.text_splitter) # return documents # except Exception as e: # print(f"Error loading PDF {file_path}: {e}") # return [] def load_from_url(self, url: str) -> List[Document]: """Load document(s) from a URL""" loader = WebBaseLoader(url) return loader.load() def load_from_pdf_dir(self, directory: Union[str, Path]) -> List[Document]: """Load documents from all PDFs inside a directory""" loader = PyPDFDirectoryLoader(str(directory)) return loader.load() def load_from_txt(self, file_path: Union[str, Path]) -> List[Document]: """Load document(s) from a TXT file""" loader = TextLoader(str(file_path), encoding="utf-8") return loader.load() def load_from_pdf(self, file_path: Union[str, Path]) -> List[Document]: """Load document(s) from a PDF file""" loader = PyPDFDirectoryLoader(str("data")) return loader.load() def load_documents(self, sources: List[str]) -> List[Document]: """ Load documents from URLs, PDF directories, or TXT files Args: sources: List of URLs, PDF folder paths, or TXT file paths Returns: List of loaded documents """ docs: List[Document] = [] for src in sources: if src.startswith("http://") or src.startswith("https://"): docs.extend(self.load_from_url(src)) path = Path("data") if path.is_dir(): # PDF directory docs.extend(self.load_from_pdf_dir(path)) elif path.suffix.lower() == ".txt": docs.extend(self.load_from_txt(path)) else: raise ValueError( f"Unsupported source type: {src}. " "Use URL, .txt file, or PDF directory." ) return docs def split_documents(self, documents: List[Document]) -> List[Document]: """ Split documents into chunks Args: documents: List of documents to split Returns: List of split documents """ return self.splitter.split_documents(documents) def process_urls(self, urls: List[str]) -> List[Document]: """ Complete pipeline to load and split documents Args: urls: List of URLs to process Returns: List of processed document chunks """ docs = self.load_documents(urls) return self.split_documents(docs)