Spaces:
No application file
No application file
File size: 1,497 Bytes
ea68259 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 |
import io
from typing import List
from PyPDF2 import PdfReader
def extract_text_from_pdf(file_bytes: bytes) -> str:
    """
    Read an in-memory PDF and return the text of all its pages.

    Args:
        file_bytes: Raw bytes of the uploaded PDF.

    Returns:
        The text of every page that produced any, joined with newlines,
        with leading and trailing whitespace stripped. Empty string when
        no page yields text.
    """
    pdf = PdfReader(io.BytesIO(file_bytes))
    # Pages whose extraction result is falsy (None or "") are skipped.
    raw_pages = (page.extract_text() for page in pdf.pages)
    page_texts = [content for content in raw_pages if content]
    return "\n".join(page_texts).strip()
def chunk_text(text: str, max_chars: int = 2000, overlap: int = 200) -> List[str]:
    """
    Split a long text into overlapping chunks to fit model context and
    make retrieval easier for RAG-style Q&A.

    Each chunk is at most ``max_chars`` characters long. Every chunk except
    the last is cut just after the last "." found inside its window, when
    there is one, so chunks tend to end on a sentence boundary. Chunks are
    whitespace-stripped and empty chunks are dropped.

    Args:
        text: Full document text.
        max_chars: Maximum characters per chunk. Must be positive.
        overlap: Number of characters the next chunk re-reads from the end
            of the previous one. If stepping back by ``overlap`` would not
            move the window forward (e.g. ``overlap >= max_chars`` or a very
            early sentence boundary), the next chunk starts with no overlap.

    Returns:
        List of text chunks, in document order. Empty list for empty text.

    Raises:
        ValueError: If ``max_chars`` is not positive.
    """
    if max_chars <= 0:
        raise ValueError("max_chars must be a positive integer")

    chunks: List[str] = []
    length = len(text)
    start = 0

    while start < length:
        end = min(start + max_chars, length)

        # Try to end at a sentence boundary when possible; the final chunk
        # always runs to the end of the text.
        if end < length:
            last_period = text.rfind(".", start, end)
            if last_period != -1:
                end = last_period + 1

        chunk = text[start:end].strip()
        if chunk:
            chunks.append(chunk)

        # The tail of the text has been emitted. Stepping back by `overlap`
        # here would re-emit the same final chunk forever.
        if end >= length:
            break

        # Move the window forward with overlap, but never let it stall or
        # go backwards: fall back to a non-overlapping step if needed.
        next_start = end - overlap
        start = next_start if next_start > start else end

    return chunks
|