Spaces:
Running
Running
File size: 1,079 Bytes
91f7410 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 | from __future__ import annotations
import io
import re
from typing import BinaryIO
import pdfplumber
from app.models.schemas import PageContent
# Matches an entire line that *starts* with a page marker, in English or
# Indonesian, so the whole line can be blanked out:
#   "page 3", "halaman 3", "3 of 10", "3 dari 10" (any letter case).
# MULTILINE anchors ^/$ at every line boundary; the trailing ``.*`` swallows
# whatever follows the marker on that same line.
_HEADER_FOOTER_PATTERN = re.compile(
    r"^(?:"
    r"page\s*\d+"                  # "page N"
    r"|halaman\s*\d+"              # "halaman N"
    r"|\d+\s*(?:of|dari)\s*\d+"    # "N of M" / "N dari M"
    r").*$",
    flags=re.MULTILINE | re.IGNORECASE,
)
def extract_pages(file_bytes: bytes, filename: str) -> list[PageContent]:
    """Extract cleaned per-page text from a PDF held in memory.

    Args:
        file_bytes: Raw bytes of the PDF document.
        filename: Not used by this function; kept as part of the signature.

    Returns:
        The list produced by ``_extract_from_stream`` for the wrapped bytes.

    Note:
        The pages are visited once and the result holds at most one entry
        per page (p = page count).
    """
    # pdfplumber is given a file-like object, so wrap the bytes in a buffer.
    return _extract_from_stream(io.BytesIO(file_bytes))
def _extract_from_stream(stream: BinaryIO) -> list[PageContent]:
    """Read a PDF from a binary stream and collect its non-blank pages.

    Args:
        stream: Binary file-like object handed to ``pdfplumber.open``.

    Returns:
        One ``PageContent`` per page whose cleaned text is non-empty, in
        document order. ``page_number`` is the 1-based position of the page
        in the PDF, so skipped pages leave gaps in the numbering.
    """
    with pdfplumber.open(stream) as document:
        # Lazily pair each 1-based page index with its cleaned text; a falsy
        # extraction result is treated as an empty string.
        numbered_text = (
            (index, _clean_page_text(page.extract_text() or ""))
            for index, page in enumerate(document.pages, start=1)
        )
        # Keep only the pages that still have text after cleaning.
        return [
            PageContent(page_number=index, text=body)
            for index, body in numbered_text
            if body.strip()
        ]
def _clean_page_text(text: str) -> str:
    """Strip page-marker lines and excess blank lines from one page's text.

    Args:
        text: Text of a single page.

    Returns:
        The text with every line matched by ``_HEADER_FOOTER_PATTERN``
        emptied, runs of three or more newlines reduced to two, and leading
        and trailing whitespace removed.
    """
    without_markers = _HEADER_FOOTER_PATTERN.sub("", text)
    # Emptied marker lines leave stacked newlines behind; cap each run at
    # a single blank line.
    collapsed = re.sub(r"\n{3,}", "\n\n", without_markers)
    return collapsed.strip()
|