Legal-AI-Help-Agent / data /extract_clean_text.py
JARVISXIRONMAN's picture
Create data/extract_clean_text.py
78275c8 verified
raw
history blame contribute delete
498 Bytes
import fitz # PyMuPDF
# 📄 List of PDFs to extract
# Each entry pairs a source PDF path with the text file that receives
# its extracted contents.
pdf_files = [
    ("data/rent_act.pdf", "data/rent_act_clean.txt"),
    ("data/contract_act.pdf", "data/contract_act_clean.txt"),
]

# Marker written after every page so page boundaries stay visible in the
# plain-text output.
_PAGE_BREAK = "\n--- PAGE BREAK ---\n"


def _extract_pdf_text(pdf_path, txt_path):
    """Write the text of every page in *pdf_path* to *txt_path*.

    The output file is opened in write mode (UTF-8), so any existing
    content is replaced. Each page's text is followed by a page-break
    marker line.

    Args:
        pdf_path: Path of the PDF document to read.
        txt_path: Path of the text file to create or overwrite.
    """
    # The PDF is opened first, so a missing or unreadable PDF fails before
    # the output file is truncated. Both handles are released on exit, even
    # if extraction raises part-way through.
    with fitz.open(pdf_path) as doc, open(txt_path, "w", encoding="utf-8") as out:
        for page in doc:
            out.write(page.get_text())
            out.write(_PAGE_BREAK)


for pdf_path, txt_path in pdf_files:
    _extract_pdf_text(pdf_path, txt_path)
    print(f"✅ Extraction done! {txt_path} is ready.")