# pdf.tocgen.split / utils / list_longest_fonts.py
# adelevett's picture
# Upload 76 files
# 046e3b8 verified
import sys
import fitz # PyMuPDF
import heapq
def main():
    """Print the largest text spans of a PDF, ranked by font size.

    The PDF path is read from ``sys.argv[1]``. Every non-blank text span in
    the document is considered, and the 25 with the largest font size are
    printed as a table of size, 1-based page index, page label, font name and
    the first 50 characters of the span text.

    When no path is given, a usage line is printed and the process exits
    with status 1.
    """
    top_n = 25  # number of rows in the report
    max_text = 50  # characters of span text kept for display

    if len(sys.argv) < 2:
        # Name the script as it was actually invoked rather than hard-coding
        # a file name that can drift from the real one.
        script = (sys.argv[0] if sys.argv else "") or "list_longest_fonts.py"
        print(f"Usage: python {script} <input.pdf>")
        sys.exit(1)

    # The context manager closes the document even if scanning fails.
    with fitz.open(sys.argv[1]) as doc:
        print(f"Scanning {len(doc)} pages...")
        # nlargest keeps only top_n records in memory while streaming the
        # spans, and yields the same rows (ties included) as sorting
        # everything in descending order and slicing the first top_n.
        largest = heapq.nlargest(
            top_n,
            _iter_spans(doc, max_text),
            key=lambda span: span["size"],
        )

    print(f"\n--- TOP {top_n} LARGEST TEXT SPANS ---")
    print(f"{'SIZE (pt)':<10} {'IDX':<6} {'LABEL':<8} {'FONT':<25} {'TEXT'}")
    print("-" * 75)
    for c in largest:
        print(f"{c['size']:<10.2f} {c['page']:<6} {c['label']:<8} {c['font']:<25} '{c['text']}'")


def _iter_spans(doc, max_text):
    """Yield one record per non-blank text span of *doc*, page by page.

    Each record is a dict with the span's font ``size``, its stripped
    ``text`` truncated to *max_text* characters, the 1-based ``page`` index,
    the page ``label`` and the ``font`` name.
    """
    for page in doc:
        label = page.get_label()
        for block in page.get_text("dict")["blocks"]:
            # Blocks without a "lines" entry carry no text spans to report.
            for line in block.get("lines", ()):
                for span in line["spans"]:
                    text = span["text"].strip()
                    if not text:
                        continue
                    yield {
                        "size": span["size"],
                        "text": text[:max_text],  # truncate for display
                        "page": page.number + 1,
                        "label": label,
                        "font": span["font"],
                    }
# Run the report only when this file is executed as a script, so importing
# the module does not trigger a scan.
if __name__ == "__main__":
    main()