Spaces:
Build error
Build error
| import PyPDF2 | |
| from typing import List | |
class PDFLoader:
    """Read a PDF file from disk and expose its text one page at a time."""

    def __init__(self, path: str):
        """Remember where the PDF lives; nothing is opened yet.

        Args:
            path: Filesystem path of the PDF file to load.
        """
        self.path = path

    def load_documents(self) -> List[str]:
        """Extract the text of every page of the PDF at ``self.path``.

        Returns:
            One entry per page, in page order, each being whatever
            ``page.extract_text()`` produced for that page.
        """
        # Extraction happens while the file handle is still open.
        with open(self.path, 'rb') as pdf_file:
            reader = PyPDF2.PdfReader(pdf_file)
            return [page.extract_text() for page in reader.pages]
class CharacterTextSplitter:
    """Split strings into fixed-size character chunks that overlap.

    Consecutive chunks of one text share ``chunk_overlap`` characters: each
    chunk starts ``chunk_size - chunk_overlap`` characters after the
    previous one. The last chunk of a text may be shorter than
    ``chunk_size``.
    """

    def __init__(self, chunk_size: int = 1500, chunk_overlap: int = 300):
        """Store the chunking parameters.

        Args:
            chunk_size: Maximum number of characters per chunk; must be
                positive.
            chunk_overlap: Number of characters shared by neighbouring
                chunks; must be smaller than ``chunk_size``. A negative
                value leaves a gap between chunks instead.

        Raises:
            ValueError: If the parameters would not let the splitter
                advance through a text.
        """
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        # Fail fast on a configuration that could never make progress.
        self._step()

    def _step(self) -> int:
        """Return the distance between chunk starts, validating the config."""
        if self.chunk_size <= 0:
            raise ValueError(
                f"chunk_size must be positive, got {self.chunk_size}"
            )
        if self.chunk_overlap >= self.chunk_size:
            raise ValueError(
                f"chunk_overlap ({self.chunk_overlap}) must be smaller "
                f"than chunk_size ({self.chunk_size})"
            )
        return self.chunk_size - self.chunk_overlap

    def split_texts(self, texts: List[str]) -> List[str]:
        """Split every text and return all chunks as one flat list.

        Args:
            texts: Strings to split; each is chunked independently.

        Returns:
            The chunks of all texts, in input order. Empty texts
            contribute no chunks.
        """
        return [chunk for text in texts for chunk in self._split_text(text)]

    def _split_text(self, text: str) -> List[str]:
        """Split a single string into overlapping chunks.

        Args:
            text: The string to split.

        Returns:
            The chunks in order; an empty list for an empty string.

        Raises:
            ValueError: If the current ``chunk_size`` / ``chunk_overlap``
                values would not let the splitter advance.
        """
        # Re-validated here because the attributes are public and may have
        # been changed after construction.
        step = self._step()
        length = len(text)
        chunks = []
        start = 0
        while start < length:
            end = min(start + self.chunk_size, length)
            chunks.append(text[start:end])
            if end == length:
                # The chunk reached the end of the text. Stepping back by
                # the overlap from here would re-emit the same tail forever.
                break
            start += step
        return chunks