Spaces:
Sleeping
Sleeping
| import fitz # PyMuPDF | |
| # π List of PDFs to extract | |
| pdf_files = [ | |
| ("data/rent_act.pdf", "data/rent_act_clean.txt"), | |
| ("data/contract_act.pdf", "data/contract_act_clean.txt"), | |
| ] | |
| for pdf_path, txt_path in pdf_files: | |
| doc = fitz.open(pdf_path) | |
| with open(txt_path, "w", encoding="utf-8") as f: | |
| for page in doc: | |
| text = page.get_text() | |
| f.write(text) | |
| f.write("\n--- PAGE BREAK ---\n") | |
| print(f"β Extraction done! {txt_path} is ready.") | |