"""Find a text anchor in a PDF and describe the span that comes just before it.

Usage:
    python find_preceding.py <pdf_path> "<anchor_text>"

For every span whose text contains the anchor, the script prints the font,
size, colour, bounding box and flags of the span that immediately preceded it
in extraction order, plus a suggested ``[[heading]]`` recipe filter.
"""

import sys

import fitz  # PyMuPDF


def _iter_spans(doc):
    """Yield ``(page_number, span)`` for each text span of *doc*.

    Spans are produced in the order PyMuPDF returns them from
    ``page.get_text("dict")``. ``page_number`` is 1-based. Blocks that carry
    no ``"lines"`` key (e.g. image blocks) are skipped.
    """
    for page in doc:
        for block in page.get_text("dict")["blocks"]:
            if "lines" not in block:
                continue
            for line in block["lines"]:
                for span in line["spans"]:
                    yield page.number + 1, span


def _print_preceding(span, page_num):
    """Print the details of the preceding *span* and a matching recipe filter."""
    print("\n--- PRECEDING ELEMENT DETAILS ---")
    print(f"Text: '{span['text']}'")
    print(f"Page: {page_num}")
    print(f"Font: {span['font']}")
    print(f"Size: {span['size']:.4f}")
    print(f"Color: {hex(span['color'])}")
    print(f"BBox: {span['bbox']}")
    print(f"Flags: {span['flags']}")

    # Helper for recipe creation
    print("\n--- SUGGESTED RECIPE FILTER ---")
    print("[[heading]]")
    print(f"font.name = \"{span['font']}\"")
    print(f"font.size = {span['size']}")


def main():
    """Report, for every span containing the anchor text, the span before it.

    Reads the PDF path from ``sys.argv[1]`` and the anchor substring from
    ``sys.argv[2]``. Exits with status 1 after printing a usage line when
    fewer than two arguments are supplied. All matches in the document are
    reported, not just the first one.
    """
    if len(sys.argv) < 3:
        print('Usage: python find_preceding.py <pdf_path> "<anchor_text>"')
        sys.exit(1)

    pdf_path = sys.argv[1]
    anchor = sys.argv[2]

    # The context manager closes the document even if extraction raises.
    with fitz.open(pdf_path) as doc:
        prev_span = None
        prev_page_num = -1
        found = False

        print(f"Searching for anchor containing: '{anchor}'")

        for page_num, span in _iter_spans(doc):
            current_text = span["text"]

            if anchor in current_text:
                print(f"\n[!] MATCH FOUND on Page {page_num}")
                print(f" Anchor Span Text: '{current_text.strip()}'")

                if prev_span is not None:
                    _print_preceding(prev_span, prev_page_num)
                else:
                    print("\n[!] No preceding text element found (this might be the first element).")

                found = True

            # Update tracker: the tracker carries across page boundaries, so
            # the "preceding" span may live on an earlier page than the match.
            prev_span = span
            prev_page_num = page_num

    if not found:
        print(f"\nAnchor text '{anchor}' not found in document.")


if __name__ == "__main__":
    main()