Spaces:
Running
Running
File size: 966 Bytes
b02630d ee0f8f3 b02630d | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 | from typing import List
from langchain_community.document_loaders import WebBaseLoader, PyPDFLoader, TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
class DocumentProcessor:
    """Load source material and break it into chunks for a RAG pipeline.

    Each ``load_*`` method hands its argument to the matching LangChain
    loader and returns whatever that loader's ``load()`` produces.
    ``split`` runs documents through the splitter built in ``__init__``.
    """

    def __init__(
        self,
        chunk_size: int = 400,
        chunk_overlap: int = 80,
    ) -> None:
        """Build the text splitter used by :meth:`split`.

        Args:
            chunk_size: Forwarded unchanged as the splitter's ``chunk_size``.
            chunk_overlap: Forwarded unchanged as the splitter's
                ``chunk_overlap``.
        """
        splitter_options = {
            "chunk_size": chunk_size,
            "chunk_overlap": chunk_overlap,
        }
        self.splitter = RecursiveCharacterTextSplitter(**splitter_options)

    def load_url(self, url: str) -> List[Document]:
        """Load documents from ``url`` via ``WebBaseLoader``."""
        web_loader = WebBaseLoader(url)
        return web_loader.load()

    def load_pdf(self, file_path: str) -> List[Document]:
        """Load documents from the PDF at ``file_path`` via ``PyPDFLoader``."""
        pdf_loader = PyPDFLoader(file_path)
        return pdf_loader.load()

    def load_txt(self, file_path: str) -> List[Document]:
        """Load the text file at ``file_path`` via ``TextLoader``.

        The loader is always asked to decode the file as UTF-8.
        """
        text_loader = TextLoader(file_path, encoding="utf-8")
        return text_loader.load()

    def split(self, docs: List[Document]) -> List[Document]:
        """Return ``docs`` split into chunks by the configured splitter."""
        chunks = self.splitter.split_documents(docs)
        return chunks
|