Spaces:
No application file
No application file
| import io | |
| from typing import List | |
| from PyPDF2 import PdfReader | |
def extract_text_from_pdf(file_bytes: bytes) -> str:
    """
    Read an in-memory PDF and return the text of its pages.

    Args:
        file_bytes: Raw bytes of the uploaded PDF.

    Returns:
        The text of every page that yielded any, joined with newlines,
        with leading and trailing whitespace removed.
    """
    pdf = PdfReader(io.BytesIO(file_bytes))
    # Pages whose extraction yields nothing (None or "") are skipped.
    page_texts = [
        content
        for content in (page.extract_text() for page in pdf.pages)
        if content
    ]
    return "\n".join(page_texts).strip()
def chunk_text(text: str, max_chars: int = 2000, overlap: int = 200) -> List[str]:
    """
    Split a long text into overlapping chunks to fit model context and
    make retrieval easier for RAG-style Q&A.

    Each chunk is at most ``max_chars`` characters long. When a chunk is not
    the last one, it is shortened to end just after the last "." inside the
    window, provided that doing so still lets the next window start further
    along than the current one. Chunks are stripped of surrounding whitespace
    and empty chunks are dropped.

    Args:
        text: Full document text.
        max_chars: Maximum characters per chunk. Must be at least 1.
        overlap: Number of characters shared between the end of one chunk
            and the start of the next. Must be smaller than ``max_chars``.

    Returns:
        List of text chunks, in document order.

    Raises:
        ValueError: If ``max_chars`` is less than 1, or ``overlap`` is not
            smaller than ``max_chars`` (the window could never advance).
    """
    if max_chars < 1:
        raise ValueError("max_chars must be at least 1")
    if overlap >= max_chars:
        raise ValueError("overlap must be smaller than max_chars")

    chunks: List[str] = []
    length = len(text)
    start = 0

    while start < length:
        end = min(start + max_chars, length)

        if end < length:
            # Try to end at a sentence boundary when possible, but only if
            # the next window (boundary - overlap) still moves past `start`;
            # otherwise a period near the window start would stall the loop.
            last_period = text.rfind(".", start, end)
            if last_period != -1:
                boundary = last_period + 1
                if boundary - overlap > start:
                    end = boundary

        chunk = text[start:end].strip()
        if chunk:
            chunks.append(chunk)

        # The whole text has been consumed. Stepping back by `overlap` here
        # would re-emit the tail forever.
        if end >= length:
            break

        # Move window forward with overlap. The checks above guarantee that
        # end - overlap > start, so every iteration makes progress.
        start = end - overlap

    return chunks