File size: 940 Bytes
5f6d148
 
 
 
6817692
5f6d148
 
 
 
6817692
5f6d148
 
 
 
 
 
 
6817692
5f6d148
 
 
 
 
 
6817692
5f6d148
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
"""
document_ops.py
Utilities for reading PDFs/TXT and chunking text.
"""

from io import BytesIO
from pathlib import Path
from typing import List
from PyPDF2 import PdfReader

async def pdf_to_text_fileobj(fileobj) -> str:
    """Return the text of every page of a PDF, joined by newlines.

    Args:
        fileobj: Object exposing an awaitable ``read()`` that yields the raw
            PDF bytes.

    Returns:
        The per-page text from ``extract_text()`` joined with ``"\\n"``.
        A page whose extraction yields a falsy value (``None`` or ``""``)
        contributes an empty string, so the page count is preserved.
    """
    raw = await fileobj.read()
    # PdfReader needs a seekable stream, hence the in-memory buffer.
    pdf = PdfReader(BytesIO(raw))
    return "\n".join(page.extract_text() or "" for page in pdf.pages)

def read_text_fileobj(fileobj) -> str:
    """Read the whole underlying stream of *fileobj* and return it as text.

    Args:
        fileobj: Object whose ``file`` attribute is a seekable stream.

    Returns:
        The stream contents. ``bytes`` are decoded as UTF-8 with undecodable
        bytes dropped; any other value is passed through ``str()``.
    """
    stream = fileobj.file
    # Rewind first so the full content is returned even after earlier reads.
    stream.seek(0)
    payload = stream.read()
    if not isinstance(payload, bytes):
        return str(payload)
    return payload.decode("utf-8", errors="ignore")

def chunk_text(text: str, chunk_size: int = 1000, overlap: int = 200) -> List[str]:
    """Split *text* into consecutive chunks that share *overlap* characters.

    Each chunk is at most ``chunk_size`` characters long and starts
    ``chunk_size - overlap`` characters after the previous one, so
    neighbouring chunks repeat the last ``overlap`` characters of the
    earlier chunk.

    Args:
        text: The string to split. An empty string yields an empty list.
        chunk_size: Maximum length of each chunk; must be positive.
        overlap: Number of characters shared by consecutive chunks; must be
            smaller than ``chunk_size``.

    Returns:
        The chunks in order. The last chunk may be shorter than
        ``chunk_size``.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``overlap`` is not
            smaller than ``chunk_size`` (either would prevent the window
            from advancing).
    """
    if not text:
        return []
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if overlap >= chunk_size:
        raise ValueError("overlap must be smaller than chunk_size")

    step = chunk_size - overlap
    total = len(text)
    chunks = []
    for start in range(0, total, step):
        end = start + chunk_size
        chunks.append(text[start:end])
        # Stop once a chunk reaches the end of the text; otherwise the next
        # window would only repeat the tail of the chunk just emitted.
        if end >= total:
            break
    return chunks