from __future__ import annotations import io import re from typing import BinaryIO import pdfplumber from app.models.schemas import PageContent _HEADER_FOOTER_PATTERN = re.compile( r"^(?:page\s*\d+|halaman\s*\d+|\d+\s*(?:of|dari)\s*\d+).*$", re.IGNORECASE | re.MULTILINE, ) # Complexity: Time O(p) | Space O(p) — p = page count def extract_pages(file_bytes: bytes, filename: str) -> list[PageContent]: stream = io.BytesIO(file_bytes) return _extract_from_stream(stream) def _extract_from_stream(stream: BinaryIO) -> list[PageContent]: pages: list[PageContent] = [] with pdfplumber.open(stream) as pdf: for page_num, page in enumerate(pdf.pages, start=1): raw_text = page.extract_text() or "" cleaned = _clean_page_text(raw_text) if cleaned.strip(): pages.append(PageContent(page_number=page_num, text=cleaned)) return pages def _clean_page_text(text: str) -> str: text = _HEADER_FOOTER_PATTERN.sub("", text) text = re.sub(r"\n{3,}", "\n\n", text) return text.strip()