| """ | |
| document_ops.py | |
| Utilities for reading PDFs/TXT and chunking text. | |
| """ | |
| from io import BytesIO | |
| from pathlib import Path | |
| from typing import List | |
| from PyPDF2 import PdfReader | |
async def pdf_to_text_fileobj(fileobj) -> str:
    """Return the text of every page of a PDF, joined by newlines.

    Args:
        fileobj: Object whose ``read()`` is awaitable and yields the raw PDF
            bytes (presumably an async upload wrapper; confirm with callers).

    Returns:
        The per-page text joined with ``"\\n"``. A page whose
        ``extract_text()`` result is falsy contributes an empty string, so
        the number of joined segments equals the page count.
    """
    raw = await fileobj.read()
    # PdfReader is handed an in-memory buffer holding the whole payload.
    pdf = PdfReader(BytesIO(raw))
    page_texts = [page.extract_text() or "" for page in pdf.pages]
    return "\n".join(page_texts)
def read_text_fileobj(fileobj) -> str:
    """Read a file-like object from its start and return its content as text.

    Args:
        fileobj: Either a wrapper exposing the underlying stream as ``.file``
            or a plain seekable stream with ``seek()`` and ``read()``.

    Returns:
        The full content as ``str``. Bytes-like payloads are decoded as UTF-8
        with undecodable bytes dropped; any other payload is passed through
        ``str()``.
    """
    # Wrappers keep the real stream on ``.file``; a bare stream is used as-is.
    stream = getattr(fileobj, "file", fileobj)
    # Rewind so an earlier partial read does not truncate the result.
    stream.seek(0)
    payload = stream.read()
    # Decode every bytes-like payload: str() on a bytearray or memoryview
    # would return its repr rather than the text it holds.
    if isinstance(payload, (bytes, bytearray, memoryview)):
        return bytes(payload).decode("utf-8", errors="ignore")
    return str(payload)
def chunk_text(text: str, chunk_size: int = 1000, overlap: int = 200) -> List[str]:
    """Split *text* into fixed-size chunks that overlap their neighbours.

    Consecutive chunks start ``chunk_size - overlap`` characters apart, so
    each chunk repeats the last ``overlap`` characters of the previous one.
    The final chunk may be shorter than ``chunk_size``.

    Args:
        text: The text to split. Empty text yields an empty list.
        chunk_size: Maximum number of characters per chunk; must be positive.
        overlap: Number of characters shared between consecutive chunks;
            must satisfy ``0 <= overlap < chunk_size``.

    Returns:
        The chunks in document order.

    Raises:
        ValueError: If ``chunk_size`` is not positive, or ``overlap`` is
            negative or not smaller than ``chunk_size`` (the window could
            never advance).
    """
    if not text:
        return []
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    total = len(text)
    stride = chunk_size - overlap
    chunks = []
    for start in range(0, total, stride):
        end = start + chunk_size
        chunks.append(text[start:end])
        # Stop once a chunk reaches the end of the text; a further window
        # would only repeat the tail already contained in this chunk.
        if end >= total:
            break
    return chunks