Spaces:
Sleeping
Sleeping
| import sys | |
| import fitz # PyMuPDF | |
| import heapq | |
def _iter_text_spans(doc):
    """Yield one display record per non-blank text span in *doc*.

    Records are produced in the order the pages, blocks, lines and spans
    are reported by ``page.get_text("dict")``. Each record is a dict with
    the keys ``size``, ``text`` (stripped, truncated to 50 characters),
    ``page`` (1-based page index), ``label`` (the page label) and ``font``.
    """
    for page in doc:
        label = page.get_label()
        for block in page.get_text("dict")["blocks"]:
            # Blocks without a "lines" entry hold no text spans
            # (presumably image blocks), so they contribute nothing.
            for line in block.get("lines", ()):
                for span in line["spans"]:
                    text = span["text"].strip()
                    if not text:
                        continue
                    yield {
                        "size": span["size"],
                        "text": text[:50],  # Truncate for display
                        "page": page.number + 1,
                        "label": label,
                        "font": span["font"],
                    }


def main():
    """Print the largest text spans of the PDF named on the command line.

    Reads the PDF path from ``sys.argv[1]``, scans every page and prints a
    table of the 25 spans with the largest font size, biggest first. Spans
    of equal size keep their document order.

    Exits with status 1 after printing a usage message when no path is
    given.
    """
    if len(sys.argv) < 2:
        print("Usage: python list_largest_fonts.py <input.pdf>")
        sys.exit(1)

    # Single source of truth for both the heading and the row count.
    limit = 25

    doc = fitz.open(sys.argv[1])
    try:
        print(f"Scanning {len(doc)} pages...")
        # nlargest is equivalent to sorted(..., reverse=True)[:limit] but
        # only keeps `limit` records alive instead of every span in the PDF.
        largest = heapq.nlargest(
            limit,
            _iter_text_spans(doc),
            key=lambda record: record["size"],
        )
    finally:
        # Release the file handle even if a page fails to parse.
        doc.close()

    # No deduplication by (size, font): different text instances of the
    # same heading style are all worth seeing, so raw entries are shown.
    print(f"\n--- TOP {limit} LARGEST TEXT SPANS ---")
    print(f"{'SIZE (pt)':<10} {'IDX':<6} {'LABEL':<8} {'FONT':<25} {'TEXT'}")
    print("-" * 75)
    for c in largest:
        print(
            f"{c['size']:<10.2f} {c['page']:<6} {c['label']:<8} "
            f"{c['font']:<25} '{c['text']}'"
        )
# Run the scan only when executed as a script, not when imported.
if __name__ == "__main__":
    main()