File size: 4,566 Bytes
1b7129e 1ba8003 c086254 a76c973 1ba8003 1b7129e 8d56dc3 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 |
"""Document processing module for loading and splitting documents"""
from typing import List, Union
from langchain_community.document_loaders import WebBaseLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
# from langchain.schema import Document
from langchain_core.documents import Document
from pathlib import Path
from langchain_community.document_loaders import (
WebBaseLoader,
PyPDFLoader,
TextLoader,
PyPDFDirectoryLoader
)
class DocumentProcessor:
    """Handles document loading and chunking for a retrieval pipeline.

    Sources may be URLs, single PDF/TXT files, or directories of PDFs;
    loaded documents are split with a RecursiveCharacterTextSplitter.
    """

    def __init__(self, chunk_size: int = 500, chunk_overlap: int = 50):
        """
        Initialize document processor.

        Args:
            chunk_size: Size of text chunks (characters).
            chunk_overlap: Overlap between consecutive chunks (characters).
        """
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
        )

    def process_pdf(self, file_paths: List[str]) -> List[Document]:
        """Load multiple PDFs and return a combined list of chunks.

        Args:
            file_paths: Paths of the PDF files to load.

        Returns:
            Chunks from every readable PDF, concatenated in input order.
        """
        all_documents: List[Document] = []
        for path in file_paths:
            try:
                loader = PyPDFLoader(path)
                # BUG FIX: was `self.text_splitter`, which is never defined —
                # __init__ stores the splitter as `self.splitter`, so every
                # call raised AttributeError (swallowed below, returning []).
                chunks = loader.load_and_split(text_splitter=self.splitter)
                all_documents.extend(chunks)
            except Exception as e:
                # Best-effort batch: skip an unreadable PDF rather than
                # abort the whole run.
                print(f"Error loading PDF {path}: {e}")
        return all_documents

    def load_from_url(self, url: str) -> List[Document]:
        """Load document(s) from a URL."""
        loader = WebBaseLoader(url)
        return loader.load()

    def load_from_pdf_dir(self, directory: Union[str, Path]) -> List[Document]:
        """Load documents from all PDFs inside a directory."""
        loader = PyPDFDirectoryLoader(str(directory))
        return loader.load()

    def load_from_txt(self, file_path: Union[str, Path]) -> List[Document]:
        """Load document(s) from a UTF-8 TXT file."""
        loader = TextLoader(str(file_path), encoding="utf-8")
        return loader.load()

    def load_from_pdf(self, file_path: Union[str, Path]) -> List[Document]:
        """Load document(s) from a single PDF file."""
        # BUG FIX: previously ignored `file_path` and loaded the hard-coded
        # "data" directory with PyPDFDirectoryLoader.
        loader = PyPDFLoader(str(file_path))
        return loader.load()

    def load_documents(self, sources: List[str]) -> List[Document]:
        """
        Load documents from URLs, PDF files/directories, or TXT files.

        Args:
            sources: List of URLs, PDF folder paths, .pdf paths, or .txt paths.

        Returns:
            List of loaded documents.

        Raises:
            ValueError: If a source is not a URL, directory, .txt, or .pdf.
        """
        docs: List[Document] = []
        for src in sources:
            if src.startswith(("http://", "https://")):
                docs.extend(self.load_from_url(src))
                # BUG FIX: without this, URL sources fell through to the
                # file-path handling below.
                continue
            # BUG FIX: was hard-coded Path("data"), so the actual source
            # string was never inspected for non-URL inputs.
            path = Path(src)
            suffix = path.suffix.lower()
            if path.is_dir():  # PDF directory
                docs.extend(self.load_from_pdf_dir(path))
            elif suffix == ".txt":
                docs.extend(self.load_from_txt(path))
            elif suffix == ".pdf":
                docs.extend(self.load_from_pdf(path))
            else:
                raise ValueError(
                    f"Unsupported source type: {src}. "
                    "Use URL, .txt file, .pdf file, or PDF directory."
                )
        return docs

    def split_documents(self, documents: List[Document]) -> List[Document]:
        """
        Split documents into chunks.

        Args:
            documents: List of documents to split.

        Returns:
            List of split documents.
        """
        return self.splitter.split_documents(documents)

    def process_urls(self, urls: List[str]) -> List[Document]:
        """
        Complete pipeline to load and split documents.

        Args:
            urls: List of sources (URLs or paths) to process.

        Returns:
            List of processed document chunks.
        """
        docs = self.load_documents(urls)
        return self.split_documents(docs)