Spaces:
Sleeping
Sleeping
| from __future__ import annotations | |
| import io | |
| import re | |
| from typing import BinaryIO | |
| import pdfplumber | |
| from app.models.schemas import PageContent | |
# Matches a whole line that starts with a page marker, so it can be deleted
# from extracted page text. Recognized line starts (case-insensitive):
#   "page <n>", "halaman <n>", "<n> of <m>", "<n> dari <m>"
# Anything following the marker on the same line is consumed as well.
#
# Whitespace inside a marker is written as ``[^\S\n]*`` (any whitespace except
# a newline) rather than ``\s*``: under re.MULTILINE, ``\s`` would let a marker
# be assembled across two lines (e.g. "Page" on one line and "5 items ..." on
# the next) and delete the following line of body text.
_HEADER_FOOTER_PATTERN = re.compile(
    r"^(?:"
    r"page[^\S\n]*\d+"
    r"|halaman[^\S\n]*\d+"
    r"|\d+[^\S\n]*(?:of|dari)[^\S\n]*\d+"
    r").*$",
    re.IGNORECASE | re.MULTILINE,
)
def extract_pages(file_bytes: bytes, filename: str) -> list[PageContent]:
    """Extract cleaned per-page text from an in-memory PDF.

    Args:
        file_bytes: Raw bytes of the PDF document.
        filename: Accepted as part of the signature; this function does not
            read it.

    Returns:
        Whatever ``_extract_from_stream`` produces for a ``BytesIO`` wrapping
        ``file_bytes``.
    """
    # Wrap the bytes in a seekable in-memory stream and delegate the parsing.
    return _extract_from_stream(io.BytesIO(file_bytes))
def _extract_from_stream(stream: BinaryIO) -> list[PageContent]:
    """Read a PDF from ``stream`` and return its non-empty pages.

    Each page's text is extracted with pdfplumber and passed through
    ``_clean_page_text``. Pages whose cleaned text is blank are skipped, so
    the ``page_number`` values in the result (1-based, in document order)
    may have gaps.

    Args:
        stream: Binary stream handed to ``pdfplumber.open``.

    Returns:
        One ``PageContent`` per page that still has text after cleaning.
    """
    collected: list[PageContent] = []
    with pdfplumber.open(stream) as document:
        for number, pdf_page in enumerate(document.pages, start=1):
            # extract_text() may yield a falsy value; treat that as no text.
            body = _clean_page_text(pdf_page.extract_text() or "")
            if not body.strip():
                continue
            collected.append(PageContent(page_number=number, text=body))
    return collected
def _clean_page_text(text: str) -> str:
    """Remove header/footer lines and squeeze blank-line runs in page text.

    Every match of ``_HEADER_FOOTER_PATTERN`` is deleted, runs of three or
    more consecutive newlines are reduced to exactly two, and leading and
    trailing whitespace is trimmed from the result.

    Args:
        text: Raw text of a single page.

    Returns:
        The cleaned text; an empty string if nothing remains.
    """
    without_markers = _HEADER_FOOTER_PATTERN.sub("", text)
    # Deleting marker lines leaves empty lines behind; cap the gap at one.
    collapsed = re.sub(r"\n{3,}", "\n\n", without_markers)
    return collapsed.strip()