"""List the largest text spans of a PDF, ranked by font size.

Usage: python list_largest_fonts.py <pdf_file>
"""

import heapq
import sys
from operator import itemgetter

# Number of spans shown in the report.
TOP_N = 25

# Span text is cut to this many characters so report rows stay short.
MAX_TEXT_CHARS = 50


def iter_spans(doc):
    """Yield one record per non-blank text span in *doc*.

    *doc* is any iterable of page objects offering ``number``,
    ``get_label()`` and ``get_text("dict")`` (as PyMuPDF pages do).
    Each record is a dict with the keys ``size``, ``text``, ``page``,
    ``label`` and ``font``.
    """
    for page in doc:
        label = page.get_label()
        for block in page.get_text("dict")["blocks"]:
            # Blocks without a "lines" entry hold no text spans
            # (presumably image blocks); .get() skips them.
            for line in block.get("lines", ()):
                for span in line["spans"]:
                    text = span["text"].strip()
                    if not text:
                        continue
                    yield {
                        "size": span["size"],
                        "text": text[:MAX_TEXT_CHARS],
                        "page": page.number + 1,  # 1-based for display
                        "label": label,
                        "font": span["font"],
                    }


def largest_spans(spans, limit=TOP_N):
    """Return the *limit* records of *spans* with the greatest ``size``.

    The result is ordered largest first; records of equal size keep the
    order in which they were produced. Only *limit* records are held in
    memory at a time, so *spans* may be a generator.
    """
    return heapq.nlargest(limit, spans, key=itemgetter("size"))


def format_row(span):
    """Format one span record as a fixed-width report row."""
    return (
        f"{span['size']:<10.2f} {span['page']:<6} {span['label']:<8} "
        f"{span['font']:<25} '{span['text']}'"
    )


def main():
    """Scan the PDF named in ``sys.argv[1]`` and print its largest spans.

    Prints a usage message and exits with status 1 when no path is given.
    """
    if len(sys.argv) < 2:
        print("Usage: python list_largest_fonts.py <pdf_file>")
        sys.exit(1)

    # Imported here rather than at module level so that a missing argument
    # still produces the usage message, and so the helpers above can be
    # imported without PyMuPDF installed.
    import fitz  # PyMuPDF

    # The context manager closes the document once scanning is finished.
    with fitz.open(sys.argv[1]) as doc:
        print(f"Scanning {len(doc)} pages...")
        top = largest_spans(iter_spans(doc), TOP_N)

    print(f"\n--- TOP {TOP_N} LARGEST TEXT SPANS ---")
    print(f"{'SIZE (pt)':<10} {'IDX':<6} {'LABEL':<8} {'FONT':<25} {'TEXT'}")
    print("-" * 75)
    for span in top:
        print(format_row(span))


if __name__ == "__main__":
    main()