Spaces:

LeonardoMdSA
/

LLMOps-RAG_solution-HS_spaces

Running

File size: 940 Bytes

"""
document_ops.py
Utilities for reading PDFs/TXT and chunking text.
"""

from io import BytesIO
from pathlib import Path
from typing import List
from PyPDF2 import PdfReader

async def pdf_to_text_fileobj(fileobj) -> str:
    data = BytesIO(await fileobj.read())
    reader = PdfReader(data)
    pages = []
    for p in reader.pages:
        pages.append(p.extract_text() or "")
    return "\n".join(pages)

def read_text_fileobj(fileobj) -> str:
    fileobj.file.seek(0)
    b = fileobj.file.read()
    if isinstance(b, bytes):
        return b.decode("utf-8", errors="ignore")
    return str(b)

def chunk_text(text: str, chunk_size: int = 1000, overlap: int = 200) -> List[str]:
    if not text:
        return []
    chunks = []
    start = 0
    L = len(text)
    while start < L:
        end = start + chunk_size
        chunk = text[start:end]
        chunks.append(chunk)
        start = max(end - overlap, end)
    return chunks