import sys
import fitz  # PyMuPDF
import heapq

# Longest span text shown in the report; longer spans are cut for display.
_MAX_TEXT_CHARS = 50


def _iter_spans(doc):
    """Yield a record for every non-blank text span in *doc*, page by page.

    Each record is a dict with the keys ``size`` (span font size), ``text``
    (stripped span text, truncated to ``_MAX_TEXT_CHARS`` characters),
    ``page`` (``page.number + 1``), ``label`` (whatever
    ``page.get_label()`` returns for the page) and ``font`` (span font name).
    """
    for page in doc:
        label = page.get_label()
        page_index = page.number + 1
        for block in page.get_text("dict")["blocks"]:
            # Blocks without a "lines" key (e.g. image blocks) hold no spans.
            for line in block.get("lines", ()):
                for span in line["spans"]:
                    text = span["text"].strip()
                    if not text:
                        continue
                    yield {
                        "size": span["size"],
                        "text": text[:_MAX_TEXT_CHARS],
                        "page": page_index,
                        "label": label,
                        "font": span["font"],
                    }


def main(top_n: int = 25) -> None:
    """Print the *top_n* largest text spans of the PDF named in ``sys.argv[1]``.

    Args:
        top_n: Number of rows to report. Defaults to 25.

    When no input path is given on the command line, a usage line is
    written to stderr and the process exits with status 1.
    """
    if len(sys.argv) < 2:
        print("Usage: python list_largest_fonts.py <input.pdf>", file=sys.stderr)
        sys.exit(1)

    # The context manager closes the document even if scanning fails.
    with fitz.open(sys.argv[1]) as doc:
        print(f"Scanning {len(doc)} pages...")

        # nlargest consumes the span stream and retains only top_n records,
        # so the full list of spans is never stored or sorted. The result is
        # ordered largest-first, and equal sizes keep document order (same
        # as a stable descending sort followed by a slice).
        largest = heapq.nlargest(
            top_n, _iter_spans(doc), key=lambda rec: rec["size"]
        )

    print(f"\n--- TOP {top_n} LARGEST TEXT SPANS ---")
    print(f"{'SIZE (pt)':<10} {'IDX':<6} {'LABEL':<8} {'FONT':<25} {'TEXT'}")
    print("-" * 75)

    if not largest:
        # E.g. a scanned PDF that contains only images.
        print("(no text spans found)")
        return

    for rec in largest:
        print(
            f"{rec['size']:<10.2f} {rec['page']:<6} {rec['label']:<8} "
            f"{rec['font']:<25} '{rec['text']}'"
        )

# Run the report only when this file is executed as a script; importing it
# as a module defines main() without running it.
if __name__ == "__main__":
    main()