| |
| import re |
| from collections import Counter |
| import fitz |
|
|
|
|
| |
| |
| |
| |
| |
|
|
# Matches a text block that consists only of a page number: an optional
# "side" prefix (Danish for "page"), a number, and optionally a separator
# ("/", "af" or "-") followed by a second number. Examples: "12",
# "Side 3 af 10", "3/10". Matching is case-insensitive.
PAGE_NUMBER_RE = re.compile(r"^\s*(side\s*)?\d+\s*(/|af|-)?\s*\d*\s*$", flags=re.I)
|
|
# Matches a running header of the form "<section number> <title> <page>":
# a dotted section number with at least two components ("2.1", "3.4.1."),
# some title text, and a trailing page reference written either as digits
# or as a run of roman-numeral letters. Matching is case-insensitive.
RUNNING_HEADER_RE = re.compile(r"^\d+(\.\d+)+\.?\s+.+\s+([ivxlcdm]+|\d+)$", flags=re.I)
|
|
|
|
| |
| |
| |
| |
| |
| |
|
|
def normalize(text: str) -> str:
    """Collapse each run of whitespace in *text* to one space and trim both ends."""
    # Splitting without a separator breaks on whitespace runs and drops
    # leading/trailing whitespace, so re-joining with single spaces gives the
    # collapsed, stripped string.
    words = text.split()
    return " ".join(words)
|
|
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
def extract_pages(pdf_bytes: bytes):
    """Extract the non-empty text blocks of every page of a PDF.

    Args:
        pdf_bytes: Raw content of the PDF file.

    Returns:
        A list with one entry per page, in document order. Each entry is a
        list of dicts, one per non-empty text block, with the keys:

        * ``page``: 1-based page number,
        * ``text``: block text with whitespace normalized,
        * ``text_key``: lower-cased ``text`` used for comparisons,
        * ``y0`` / ``y1``: vertical extent of the block's bounding box,
        * ``height``: height of the page the block is on.
    """
    pages = []

    # Open the document as a context manager so it is closed even when
    # extraction raises; previously the handle was never released.
    with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
        for page_no, page in enumerate(doc, start=1):
            page_height = page.rect.height
            blocks = []

            for block in page.get_text("blocks", sort=True):
                # PyMuPDF documents the block tuple as
                # (x0, y0, x1, y1, text, block_no, block_type), where
                # block_type 1 marks an image block whose "text" is only a
                # placeholder description. Such blocks must not be counted.
                block_type = block[6] if len(block) > 6 else 0
                if block_type != 0:
                    continue

                y0, y1 = block[1], block[3]
                text = normalize(block[4])
                if not text:
                    continue

                blocks.append({
                    "page": page_no,
                    "text": text,
                    "text_key": text.lower(),
                    "y0": y0,
                    "y1": y1,
                    "height": page_height,
                })

            pages.append(blocks)

    return pages
|
|
|
|
| |
| |
| |
| |
|
|
def is_page_number(text: str) -> bool:
    """Return True when *text* as a whole matches ``PAGE_NUMBER_RE``."""
    # A match object is always truthy; a failed match yields None.
    return bool(PAGE_NUMBER_RE.match(text))
|
|
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
def is_top_area(block: dict) -> bool:
    """Return True when *block* ends within the top 15% of its page.

    The block must provide ``y1`` (its lower edge) and ``height`` (the page
    height).
    """
    top_limit = block["height"] * 0.15
    lower_edge = block["y1"]
    return lower_edge <= top_limit
|
|
|
|
def is_bottom_area(block: dict) -> bool:
    """Return True when *block* starts within the bottom 15% of its page.

    The block must provide ``y0`` (its upper edge) and ``height`` (the page
    height).
    """
    bottom_limit = block["height"] * 0.85
    upper_edge = block["y0"]
    return upper_edge >= bottom_limit
|
|
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
def is_running_header(block: dict) -> bool:
    """Return True when *block* is a section-style running header.

    A block qualifies when it lies in the top area of the page and its text
    matches ``RUNNING_HEADER_RE``. Text starting with "chapter " (in any
    letter case) is never treated as a running header.
    """
    heading = block["text"]

    if heading.lower().startswith("chapter "):
        return False

    if not is_top_area(block):
        return False

    return RUNNING_HEADER_RE.match(heading) is not None
|
|
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
def detect_headers_and_footers(pages, min_ratio=0.5):
    """Find repeated headers/footers, running headers and page numbers.

    Args:
        pages: One list of block dicts per page; each block provides at
            least ``text`` and ``text_key`` plus whatever the area and
            running-header checks read.
        min_ratio: Share of pages on which a top/bottom text must appear to
            count as a header/footer. The resulting page count is truncated
            with ``int()`` and never drops below 2.

    Returns:
        A 4-tuple ``(detected_headers, detected_footers, running_headers,
        page_numbers)`` of sets. The first two hold ``text_key`` values, the
        last two hold the original ``text`` values.
    """
    running_headers = set()
    page_numbers = set()
    top_hits = Counter()
    bottom_hits = Counter()

    for page_blocks in pages:
        # Per-page sets so that a text repeated on one page counts once.
        top_keys = set()
        bottom_keys = set()

        for blk in page_blocks:
            raw, key = blk["text"], blk["text_key"]

            # NOTE(review): page numbers are recognised by text alone, without
            # checking that the block sits in the top or bottom area.
            if is_page_number(raw):
                page_numbers.add(raw)
            elif is_running_header(blk):
                running_headers.add(raw)
            else:
                if is_top_area(blk):
                    top_keys.add(key)
                if is_bottom_area(blk):
                    bottom_keys.add(key)

        top_hits.update(top_keys)
        bottom_hits.update(bottom_keys)

    # NOTE(review): int() truncates, so e.g. 5 pages with min_ratio=0.5 gives
    # a threshold of 2 pages.
    threshold = max(2, int(len(pages) * min_ratio))

    def frequent(hits):
        """Return the keys seen on at least ``threshold`` pages."""
        return {key for key, seen in hits.items() if seen >= threshold}

    return frequent(top_hits), frequent(bottom_hits), running_headers, page_numbers
|
|
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
def _removal_type(
    block: dict,
    *,
    detected_headers,
    detected_footers,
    remove_headers: bool,
    remove_footers: bool,
    remove_page_numbers: bool,
):
    """Return the removal label for *block*, or None when its text is counted."""
    text = block["text"]
    text_key = block["text_key"]

    # NOTE(review): page numbers are matched by text alone, wherever the block
    # sits on the page; a body block consisting only of digits is removed too.
    if remove_page_numbers and is_page_number(text):
        return "Sidetal"

    if remove_headers:
        # Headers are learned from top-area blocks only, so only remove a
        # match that is itself in the top area; body text that merely repeats
        # the header wording must still be counted.
        if text_key in detected_headers and is_top_area(block):
            return "Sidehoved"
        if is_running_header(block):
            return "Løbende sidehoved"

    # Same reasoning as for headers: footers are learned from the bottom area.
    if remove_footers and text_key in detected_footers and is_bottom_area(block):
        return "Sidefod"

    return None


def count_characters(
    pdf_bytes: bytes,
    excluded_pages: set[int] | None = None,
    remove_headers: bool = True,
    remove_footers: bool = True,
    remove_page_numbers: bool = True,
):
    """Count the characters of a PDF, optionally skipping page furniture.

    Args:
        pdf_bytes: Raw content of the PDF file.
        excluded_pages: 1-based page numbers to leave out of the count.
        remove_headers: Drop repeated headers and running headers.
        remove_footers: Drop repeated footers.
        remove_page_numbers: Drop blocks that look like page numbers.

    Returns:
        A dict with the keys:

        * ``total_characters``: length of ``included_text``,
        * ``page_results``: one dict per page with ``Side`` (page number),
          ``Tegn`` (characters counted on that page) and ``Status``,
        * ``included_text``: the kept text of all non-empty pages joined by
          single spaces (so the total can exceed the sum of ``Tegn``),
        * ``detected_headers``, ``detected_footers``,
          ``detected_running_headers``, ``detected_page_numbers``: sorted
          detection results for the whole document,
        * ``removed_items``: one dict per removed block with ``Side``,
          ``Type`` and ``Tekst``,
        * ``page_count``: number of pages in the document.
    """
    excluded_pages = excluded_pages or set()

    pages = extract_pages(pdf_bytes)

    # Detection runs over every page, including excluded ones.
    (
        detected_headers,
        detected_footers,
        running_headers,
        detected_page_numbers,
    ) = detect_headers_and_footers(pages)

    included_text_parts = []
    page_results = []
    removed_items = []

    for page_no, blocks in enumerate(pages, start=1):
        if page_no in excluded_pages:
            page_results.append({
                "Side": page_no,
                "Tegn": 0,
                "Status": "Fravalgt",
            })
            continue

        kept_text = []

        for block in blocks:
            removal_type = _removal_type(
                block,
                detected_headers=detected_headers,
                detected_footers=detected_footers,
                remove_headers=remove_headers,
                remove_footers=remove_footers,
                remove_page_numbers=remove_page_numbers,
            )

            if removal_type is None:
                kept_text.append(block["text"])
            else:
                removed_items.append({
                    "Side": page_no,
                    "Type": removal_type,
                    "Tekst": block["text"],
                })

        page_text = " ".join(kept_text)
        included_text_parts.append(page_text)

        page_results.append({
            "Side": page_no,
            "Tegn": len(page_text),
            "Status": "Talt med",
        })

    # Pages whose kept text is empty are skipped so they add no separator.
    full_text = " ".join(part for part in included_text_parts if part)

    return {
        "total_characters": len(full_text),
        "page_results": page_results,
        "included_text": full_text,
        "detected_headers": sorted(detected_headers),
        "detected_footers": sorted(detected_footers),
        "detected_running_headers": sorted(running_headers),
        "detected_page_numbers": sorted(detected_page_numbers),
        "removed_items": removed_items,
        "page_count": len(pages),
    }