Spaces:
Sleeping
Sleeping
| import sys | |
| import fitz # PyMuPDF | |
def _iter_spans(doc):
    """Yield ``(page_number, span)`` for every text span, in iteration order.

    ``page_number`` is 1-based (``page.number + 1``). Blocks that have no
    ``"lines"`` key are skipped (presumably non-text blocks such as images).
    """
    for page in doc:
        for block in page.get_text("dict")["blocks"]:
            if "lines" not in block:
                continue
            for line in block["lines"]:
                for span in line["spans"]:
                    yield page.number + 1, span


def _print_preceding_details(span, page_num):
    """Print the attributes of *span* and a recipe filter derived from it."""
    print("\n--- PRECEDING ELEMENT DETAILS ---")
    print(f"Text: '{span['text']}'")
    print(f"Page: {page_num}")
    print(f"Font: {span['font']}")
    print(f"Size: {span['size']:.4f}")
    print(f"Color: {hex(span['color'])}")
    print(f"BBox: {span['bbox']}")
    print(f"Flags: {span['flags']}")
    # Helper for recipe creation
    print("\n--- SUGGESTED RECIPE FILTER ---")
    print("[[heading]]")
    print(f"font.name = \"{span['font']}\"")
    print(f"font.size = {span['size']}")


def main():
    """Report the text span that immediately precedes each anchor match.

    Reads two command-line arguments: the PDF path (``sys.argv[1]``) and the
    anchor text (``sys.argv[2]``). Every span whose text contains the anchor
    is reported together with the details of the span seen just before it,
    which may be on an earlier page. Matching is per span, so an anchor that
    is split across several spans is not found.

    Exits with status 1 after printing a usage message when fewer than two
    arguments are supplied.
    """
    if len(sys.argv) < 3:
        print("Usage: python find_preceding.py <input.pdf> \"<anchor_text>\"")
        sys.exit(1)

    pdf_path = sys.argv[1]
    anchor = sys.argv[2]

    prev_span = None
    prev_page_num = -1
    found = False

    print(f"Searching for anchor containing: '{anchor}'")

    # The context manager closes the document even if extraction raises;
    # the original left the handle open.
    with fitz.open(pdf_path) as doc:
        for page_num, span in _iter_spans(doc):
            current_text = span["text"]
            if anchor in current_text:
                print(f"\n[!] MATCH FOUND on Page {page_num}")
                print(f" Anchor Span Text: '{current_text.strip()}'")
                if prev_span is not None:
                    _print_preceding_details(prev_span, prev_page_num)
                else:
                    print("\n[!] No preceding text element found (this might be the first element).")
                found = True

            # Update tracker: every span, matching or not, becomes the
            # "preceding" element for the next one.
            prev_span = span
            prev_page_num = page_num

    if not found:
        print(f"\nAnchor text '{anchor}' not found in document.")
# Run the search only when executed as a script, not when imported.
if __name__ == "__main__":
    main()