File size: 498 Bytes
78275c8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
import fitz  # PyMuPDF

# PDFs to extract, as (source PDF path, destination text path) pairs.
pdf_files = [
    ("data/rent_act.pdf", "data/rent_act_clean.txt"),
    ("data/contract_act.pdf", "data/contract_act_clean.txt"),
]

# Dump the text of every page of each PDF into its companion text file,
# writing a marker line after each page so page boundaries stay visible.
for pdf_path, txt_path in pdf_files:
    # Open the PDF first (so a missing/corrupt PDF does not leave an empty
    # output file behind) and manage both handles with `with`, so the
    # document is closed even if extraction or writing fails part-way.
    with fitz.open(pdf_path) as doc, open(txt_path, "w", encoding="utf-8") as f:
        for page in doc:
            f.write(page.get_text())
            f.write("\n--- PAGE BREAK ---\n")
    print(f"✅ Extraction done! {txt_path} is ready.")