Spaces:
Running
Running
| from typing import List | |
| from langchain_community.document_loaders import WebBaseLoader, PyPDFLoader, TextLoader | |
| from langchain_text_splitters import RecursiveCharacterTextSplitter | |
| from langchain_core.documents import Document | |
class DocumentProcessor:
    """Load documents from URLs, PDFs, or text files and chunk them for RAG.

    Loading is delegated to the LangChain community loaders and chunking to a
    single ``RecursiveCharacterTextSplitter`` created at construction time.
    """

    def __init__(self, chunk_size: int = 400, chunk_overlap: int = 80) -> None:
        """Create the splitter used by every later ``split`` call.

        Args:
            chunk_size: Forwarded to the splitter as ``chunk_size``.
            chunk_overlap: Forwarded to the splitter as ``chunk_overlap``.
        """
        splitter_options = {
            "chunk_size": chunk_size,
            "chunk_overlap": chunk_overlap,
        }
        self.splitter = RecursiveCharacterTextSplitter(**splitter_options)

    def load_url(self, url: str) -> List[Document]:
        """Return the documents that ``WebBaseLoader`` produces for ``url``."""
        web_loader = WebBaseLoader(url)
        return web_loader.load()

    def load_pdf(self, file_path: str) -> List[Document]:
        """Return the documents that ``PyPDFLoader`` produces for ``file_path``."""
        pdf_loader = PyPDFLoader(file_path)
        return pdf_loader.load()

    def load_txt(self, file_path: str) -> List[Document]:
        """Return the documents that ``TextLoader`` produces for ``file_path``.

        The loader is always asked to read the file as UTF-8.
        """
        text_loader = TextLoader(file_path, encoding="utf-8")
        return text_loader.load()

    def split(self, docs: List[Document]) -> List[Document]:
        """Run ``docs`` through the configured splitter and return its output."""
        chunks = self.splitter.split_documents(docs)
        return chunks