Spaces:

danielhjerresen
/

PDF_Character_Counter

Sleeping

App Files Files Community

PDF_Character_Counter / pdf_counter.py

danielhjerresen

Upload 2 files

eeeaa67 verified about 1 month ago

Raw

History Blame Contribute Delete

9.15 kB

	# pdf_counter.py
	import re
	from collections import Counter
	import fitz


	# ============================================================
	# REGEX PATTERNS
	# ============================================================
	# These patterns are used to identify page numbers and
	# running headers that should not be counted as content.

	# Matches a block that consists of nothing but a page number, with an
	# optional "side" prefix and an optional "/", "af" or "-" separator
	# followed by a second number, e.g. "12", "Side 12", "3 / 10",
	# "3 af 10" or "3-10". All whitespace is optional because the text is
	# normalized (collapsed and stripped) before it is tested.
	PAGE_NUMBER_RE = re.compile(
	    r"^\s*(side\s*)?\d+\s*(/|af|-)?\s*\d*\s*$",
	    re.IGNORECASE,
	)

	# Matches a numbered section title followed by a page number, e.g.
	# "2.1 Methods 12" or "4.3 Results iv": a dotted section number with at
	# least two levels, a title, and a trailing arabic or roman numeral.
	RUNNING_HEADER_RE = re.compile(
	    r"^\d+(\.\d+)+\.?\s+.+\s+([ivxlcdm]+|\d+)$",
	    re.IGNORECASE,
	)


	# ============================================================
	# TEXT NORMALIZATION
	# ============================================================
	# Cleans extracted text by replacing multiple whitespace
	# characters (spaces, tabs, line breaks) with a single space.
	# This ensures consistent comparison and character counting.

	def normalize(text: str) -> str:
	    """Collapse each run of whitespace into one space and trim both ends.

	    Args:
	        text: Raw text, possibly containing tabs, line breaks or
	            repeated spaces.

	    Returns:
	        The text with every whitespace run replaced by a single space
	        and with no leading or trailing whitespace.
	    """
	    collapsed = re.sub(r"\s+", " ", text)
	    return collapsed.strip()


	# ============================================================
	# PDF EXTRACTION
	# ============================================================
	# Reads the PDF and extracts all text blocks from each page.
	#
	# For every block we store:
	# - Page number
	# - Original text
	# - Lowercase version for comparisons
	# - Vertical coordinates on the page
	# - Page height
	#
	# The position data is later used to detect headers/footers.

	def extract_pages(pdf_bytes: bytes):
	    """Extract the normalized text blocks of every page in a PDF.

	    Args:
	        pdf_bytes: The raw bytes of the PDF document.

	    Returns:
	        A list with one entry per page, in page order. Each entry is a
	        list of dicts, one per non-empty text block, with the keys
	        "page" (1-based page number), "text" (normalized text),
	        "text_key" (lowercased text), "y0" and "y1" (vertical block
	        coordinates) and "height" (page height).
	    """
	    pages = []

	    # Open the document in a context manager so it is closed even when
	    # extraction raises part-way through.
	    with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
	        for page_no, page in enumerate(doc, start=1):
	            # The page height is the same for every block on the page.
	            page_height = page.rect.height
	            blocks = []

	            for block in page.get_text("blocks", sort=True):
	                _x0, y0, _x1, y1, raw_text, *meta = block

	                # PyMuPDF documents the last tuple item as the block
	                # type (0 = text, 1 = image). Image placeholders are
	                # not content and must not be counted.
	                if meta and meta[-1] != 0:
	                    continue

	                text = normalize(raw_text)
	                if not text:
	                    continue

	                blocks.append({
	                    "page": page_no,
	                    "text": text,
	                    "text_key": text.lower(),
	                    "y0": y0,
	                    "y1": y1,
	                    "height": page_height,
	                })

	            pages.append(blocks)

	    return pages


	# ============================================================
	# PAGE NUMBER DETECTION
	# ============================================================
	# Checks whether a text block looks like a page number.

	def is_page_number(text: str) -> bool:
	    """Return True when *text* matches the page-number pattern.

	    Args:
	        text: The text of a single block.

	    Returns:
	        True if PAGE_NUMBER_RE matches from the start of the text,
	        otherwise False.
	    """
	    found = PAGE_NUMBER_RE.match(text)
	    return bool(found)


	# ============================================================
	# PAGE POSITION HELPERS
	# ============================================================
	# Determines whether a text block is located near the top
	# or bottom of the page.
	#
	# Top area = top 15%
	# Bottom area = bottom 15%
	#
	# These areas are where headers and footers are expected.

	def is_top_area(block: dict, threshold: float = 0.15) -> bool:
	    """Return True when the block lies entirely within the top of the page.

	    Args:
	        block: A block dict with the keys "y1" (lower edge of the
	            block) and "height" (page height).
	        threshold: Fraction of the page height that counts as the top
	            area. Defaults to 0.15, i.e. the top 15%.

	    Returns:
	        True if the block's lower edge is at or above the threshold
	        line, otherwise False.
	    """
	    top_limit = block["height"] * threshold
	    return block["y1"] <= top_limit


	def is_bottom_area(block: dict, threshold: float = 0.85) -> bool:
	    """Return True when the block starts within the bottom of the page.

	    Args:
	        block: A block dict with the keys "y0" (upper edge of the
	            block) and "height" (page height).
	        threshold: Fraction of the page height at which the bottom
	            area begins. Defaults to 0.85, i.e. the bottom 15%.

	    Returns:
	        True if the block's upper edge is at or below the threshold
	        line, otherwise False.
	    """
	    bottom_limit = block["height"] * threshold
	    return block["y0"] >= bottom_limit


	# ============================================================
	# RUNNING HEADER DETECTION
	# ============================================================
	# Identifies chapter-style running headers such as:
	#
	# 2.1 Methods 12
	# 4.3 Results iv
	#
	# They typically appear near the top of each page and
	# follow a numbering pattern.
	#
	# "Chapter X" headings are excluded because they are often
	# actual content rather than page headers.

	def is_running_header(block: dict) -> bool:
	    """Return True when the block is a numbered running header.

	    A block qualifies when its text does not start with "chapter ",
	    it lies in the top area of the page, and its text matches
	    RUNNING_HEADER_RE.

	    Args:
	        block: A block dict; "text" is read here and the block is
	            passed on to is_top_area.

	    Returns:
	        True if all three conditions hold, otherwise False.
	    """
	    candidate = block["text"]

	    # "Chapter X" headings are treated as content, never as headers.
	    if candidate.lower().startswith("chapter "):
	        return False

	    if not is_top_area(block):
	        return False

	    return RUNNING_HEADER_RE.match(candidate) is not None


	# ============================================================
	# HEADER / FOOTER DETECTION
	# ============================================================
	# Finds text that appears repeatedly in the top or bottom
	# regions of many pages.
	#
	# Repeated top text -> header candidate
	# Repeated bottom text -> footer candidate
	#
	# A text must appear on at least min_ratio of all pages, and
	# never on fewer than 2 pages, before it is classified as a
	# header/footer.
	#
	# Default: 50% of pages.

	def detect_headers_and_footers(pages, min_ratio=0.5):
	    """Find repeated header/footer texts, running headers and page numbers.

	    Args:
	        pages: A list of pages, each a list of block dicts with at
	            least the keys "text" and "text_key", plus the position
	            keys read by the area helpers.
	        min_ratio: Fraction of pages on which a text must appear in
	            the top (or bottom) area to count as a header (or footer).
	            The required page count is never lower than 2.

	    Returns:
	        A 4-tuple of sets: (repeated header keys, repeated footer keys,
	        running-header texts, page-number texts). Header and footer
	        keys are the lowercased "text_key" values; the other two sets
	        hold the original "text" values.
	    """
	    top_hits = Counter()
	    bottom_hits = Counter()
	    numbered_headers = set()
	    numbers_found = set()

	    for page_blocks in pages:
	        # Per-page sets make sure a text counts once per page only.
	        top_keys = set()
	        bottom_keys = set()

	        for blk in page_blocks:
	            raw = blk["text"]
	            key = blk["text_key"]

	            if is_page_number(raw):
	                # Page numbers are collected, not counted as candidates.
	                numbers_found.add(raw)
	            elif is_running_header(blk):
	                # Running headers are collected separately as well.
	                numbered_headers.add(raw)
	            else:
	                if is_top_area(blk):
	                    top_keys.add(key)
	                if is_bottom_area(blk):
	                    bottom_keys.add(key)

	        top_hits.update(top_keys)
	        bottom_hits.update(bottom_keys)

	    required = max(2, int(len(pages) * min_ratio))

	    def frequent(hits):
	        """Return the keys seen on at least `required` pages."""
	        return {key for key, seen in hits.items() if seen >= required}

	    return (
	        frequent(top_hits),
	        frequent(bottom_hits),
	        numbered_headers,
	        numbers_found,
	    )


	# ============================================================
	# CHARACTER COUNTING ENGINE
	# ============================================================
	# Main workflow:
	#
	# 1. Extract all text blocks from the PDF.
	# 2. Detect repeated headers and footers.
	# 3. Detect page numbers.
	# 4. Remove unwanted elements.
	# 5. Count characters in remaining content.
	# 6. Return detailed results and diagnostics.

	def _removal_label(
	    block: dict,
	    detected_headers: set,
	    detected_footers: set,
	    remove_headers: bool,
	    remove_footers: bool,
	    remove_page_numbers: bool,
	):
	    """Return the removal label for *block*, or None when it is kept.

	    The checks run in a fixed order: page number, repeated header,
	    running header, repeated footer. The first match decides the label.
	    """
	    text = block["text"]
	    text_key = block["text_key"]

	    if remove_page_numbers and is_page_number(text):
	        return "Sidetal"
	    # NOTE(review): repeated headers/footers are matched by text only,
	    # wherever the block sits on the page - confirm this is intended.
	    if remove_headers and text_key in detected_headers:
	        return "Sidehoved"
	    if remove_headers and is_running_header(block):
	        return "Løbende sidehoved"
	    if remove_footers and text_key in detected_footers:
	        return "Sidefod"
	    return None


	def count_characters(
	    pdf_bytes: bytes,
	    excluded_pages: set[int] | None = None,
	    remove_headers: bool = True,
	    remove_footers: bool = True,
	    remove_page_numbers: bool = True,
	):
	    """Count the content characters of a PDF, skipping page furniture.

	    Args:
	        pdf_bytes: The raw bytes of the PDF document.
	        excluded_pages: 1-based page numbers to leave out entirely.
	            None means no page is excluded.
	        remove_headers: Drop repeated headers and running headers.
	        remove_footers: Drop repeated footers.
	        remove_page_numbers: Drop blocks that look like page numbers.

	    Returns:
	        A dict with the keys "total_characters", "page_results",
	        "included_text", "detected_headers", "detected_footers",
	        "detected_running_headers", "detected_page_numbers",
	        "removed_items" and "page_count". "page_results" and
	        "removed_items" are lists of dicts with Danish keys
	        ("Side", "Tegn", "Status" / "Side", "Type", "Tekst").
	    """
	    excluded_pages = excluded_pages or set()

	    # Extract all page data
	    pages = extract_pages(pdf_bytes)

	    # Detect recurring elements across the whole document, including
	    # pages the user excluded from counting.
	    (
	        detected_headers,
	        detected_footers,
	        running_headers,
	        detected_page_numbers,
	    ) = detect_headers_and_footers(pages)

	    included_text_parts = []
	    page_results = []
	    removed_items = []

	    for page_no, blocks in enumerate(pages, start=1):

	        # Skip pages excluded by the user
	        if page_no in excluded_pages:
	            page_results.append({
	                "Side": page_no,
	                "Tegn": 0,
	                "Status": "Fravalgt",
	            })
	            continue

	        kept_text = []

	        for block in blocks:
	            label = _removal_label(
	                block,
	                detected_headers,
	                detected_footers,
	                remove_headers,
	                remove_footers,
	                remove_page_numbers,
	            )

	            if label is None:
	                kept_text.append(block["text"])
	            else:
	                # Log what was dropped so the caller can show it.
	                removed_items.append({
	                    "Side": page_no,
	                    "Type": label,
	                    "Tekst": block["text"],
	                })

	        # Combine all remaining text on the page
	        page_text = " ".join(kept_text)
	        included_text_parts.append(page_text)

	        page_results.append({
	            "Side": page_no,
	            "Tegn": len(page_text),
	            "Status": "Talt med",
	        })

	    # Pages are joined with one space, so the total also counts one
	    # separator between each pair of non-empty pages.
	    full_text = " ".join(t for t in included_text_parts if t)

	    return {
	        "total_characters": len(full_text),
	        "page_results": page_results,
	        "included_text": full_text,

	        # Diagnostic information
	        "detected_headers": sorted(detected_headers),
	        "detected_footers": sorted(detected_footers),
	        "detected_running_headers": sorted(running_headers),
	        "detected_page_numbers": sorted(detected_page_numbers),

	        # Log of removed items
	        "removed_items": removed_items,

	        # Total pages in document
	        "page_count": len(pages),
	    }