File size: 940 Bytes
5f6d148 6817692 5f6d148 6817692 5f6d148 6817692 5f6d148 6817692 5f6d148 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 |
"""
document_ops.py
Utilities for reading PDFs/TXT and chunking text.
"""
from io import BytesIO
from pathlib import Path
from typing import List
from PyPDF2 import PdfReader
async def pdf_to_text_fileobj(fileobj) -> str:
    """Return the text of a PDF supplied as an awaitable file-like object.

    The whole payload is pulled into memory with ``await fileobj.read()``
    and parsed by ``PdfReader`` from an in-memory buffer.

    Args:
        fileobj: Object exposing an awaitable ``read()`` that yields the raw
            PDF bytes (presumably a web-framework upload object -- confirm
            against the caller).

    Returns:
        The extracted text of every page, joined with newlines. A page for
        which ``extract_text()`` returns a falsy value contributes an empty
        string, so the number of lines separators still matches the page
        count.
    """
    raw = await fileobj.read()
    pdf = PdfReader(BytesIO(raw))
    # ``or ""`` guards against pages where extraction yields None/empty.
    return "\n".join(page.extract_text() or "" for page in pdf.pages)
def read_text_fileobj(fileobj) -> str:
    """Read the full contents of ``fileobj.file`` and return them as text.

    The underlying stream is rewound to position 0 first, so the result does
    not depend on how much of it was consumed earlier.

    Args:
        fileobj: Object whose ``file`` attribute is a seekable, readable
            stream.

    Returns:
        The stream contents as ``str``. Bytes are decoded as UTF-8 with
        undecodable bytes dropped; any non-bytes payload is passed through
        ``str()``.
    """
    stream = fileobj.file
    stream.seek(0)
    payload = stream.read()
    if not isinstance(payload, bytes):
        return str(payload)
    # errors="ignore" silently discards invalid UTF-8 sequences.
    return payload.decode("utf-8", errors="ignore")
def chunk_text(text: str, chunk_size: int = 1000, overlap: int = 200) -> List[str]:
    """Split *text* into fixed-size character windows that overlap.

    Consecutive chunks share ``overlap`` characters: each new chunk starts
    ``chunk_size - overlap`` characters after the previous one. Splitting
    stops as soon as a chunk reaches the end of the text, so the final chunk
    may be shorter than ``chunk_size`` but is never a pure repeat of the
    previous chunk's tail.

    Args:
        text: The string to split. An empty (or otherwise falsy) value
            yields an empty list.
        chunk_size: Maximum number of characters per chunk; must be > 0.
        overlap: Number of characters repeated between neighbouring chunks;
            must satisfy ``0 <= overlap < chunk_size``.

    Returns:
        The chunks in order of appearance.

    Raises:
        ValueError: If ``chunk_size`` is not positive, or ``overlap`` is
            negative or not smaller than ``chunk_size`` (either would prevent
            the window from advancing sensibly).
    """
    if not text:
        return []
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    total = len(text)
    # Distance between the starts of neighbouring chunks; always >= 1 here.
    step = chunk_size - overlap

    chunks: List[str] = []
    for start in range(0, total, step):
        end = start + chunk_size
        chunks.append(text[start:end])
        # Once a window covers the end of the text, any further window would
        # only repeat characters already emitted.
        if end >= total:
            break
    return chunks
|