Spaces:

adelevett
/

pdf.tocgen.split

Sleeping

pdf.tocgen.split / utils /list_longest_fonts.py

Upload 76 files

046e3b8 verified about 1 month ago

1.86 kB

	import sys
	import fitz # PyMuPDF
	import heapq

	def _iter_text_spans(doc):
	    """Yield one record per non-blank text span in *doc*, in document order.

	    Each record is a dict with the keys ``size``, ``text`` (stripped and
	    truncated to 50 characters for display), ``page`` (1-based page index),
	    ``label`` (the page label reported by ``page.get_label()``) and ``font``.
	    """
	    for page in doc:
	        label = page.get_label()
	        for block in page.get_text("dict")["blocks"]:
	            # Blocks without a "lines" entry (e.g. image blocks) hold no text.
	            for line in block.get("lines", ()):
	                for span in line["spans"]:
	                    text = span["text"].strip()
	                    if not text:
	                        continue
	                    yield {
	                        "size": span["size"],
	                        "text": text[:50],  # Truncate for display
	                        "page": page.number + 1,
	                        "label": label,
	                        "font": span["font"],
	                    }


	def main():
	    """Print a table of the largest text spans found in a PDF.

	    Reads the PDF path from ``sys.argv[1]``. When no path is given, prints a
	    usage line and exits with status 1. Otherwise scans every page and prints
	    the 25 spans with the largest font size, largest first; spans of equal
	    size keep their document order.
	    """
	    top_n = 25

	    if len(sys.argv) < 2:
	        # Report the name the script was actually invoked with instead of a
	        # hard-coded (and previously mismatched) file name.
	        prog = sys.argv[0] if sys.argv else "list_longest_fonts.py"
	        print(f"Usage: python {prog} <input.pdf>")
	        sys.exit(1)

	    # The context manager closes the document even if scanning fails.
	    with fitz.open(sys.argv[1]) as doc:
	        print(f"Scanning {len(doc)} pages...")
	        # nlargest consumes the generator while the document is still open
	        # and retains only top_n records, so memory does not grow with the
	        # number of spans. Ordering matches sorted(..., reverse=True)[:top_n].
	        largest = heapq.nlargest(
	            top_n,
	            _iter_text_spans(doc),
	            key=lambda span: span["size"],
	        )

	    # No de-duplication by (size, font): distinct text instances that share
	    # a heading style are all shown as raw entries.
	    print(f"\n--- TOP {top_n} LARGEST TEXT SPANS ---")
	    print(f"{'SIZE (pt)':<10} {'IDX':<6} {'LABEL':<8} {'FONT':<25} {'TEXT'}")
	    print("-" * 75)

	    for row in largest:
	        print(
	            f"{row['size']:<10.2f} {row['page']:<6} {row['label']:<8} "
	            f"{row['font']:<25} '{row['text']}'"
	        )

	# Run the report only when executed as a script, not when imported.
	if __name__ == "__main__":
	    main()