taxdoc-preprocessor / extractor.py
iamnew123's picture
Update extractor.py
0d76c8b verified
raw
history blame contribute delete
448 Bytes
import fitz # PyMuPDF
import os
def extract_text(file):
    """Return the text content of a PDF or plain-text file.

    Args:
        file: Either a filesystem path (``str`` or ``os.PathLike``) or an
            object that exposes the path through a ``name`` attribute, such
            as a temp-file wrapper handed over by an upload widget. Any
            falsy value is treated as "no file supplied".

    Returns:
        ``""`` when ``file`` is falsy; the text of every page joined with
        newlines for ``.pdf``; the decoded file content for ``.txt``; and
        the literal string ``"Unsupported file type"`` for any other
        extension. The extension comparison is case-insensitive.
    """
    if not file:
        return ""

    # Path-like inputs are used as-is. This must be checked first because
    # pathlib.Path also has a ``.name`` attribute, but it holds only the
    # basename, which would point at the wrong file.
    if isinstance(file, (str, os.PathLike)):
        path = os.fspath(file)
    else:
        path = file.name

    file_ext = os.path.splitext(path)[1].lower()

    if file_ext == ".pdf":
        with fitz.open(path) as doc:
            return "\n".join(page.get_text() for page in doc)

    if file_ext == ".txt":
        # "utf-8-sig" reads plain UTF-8 unchanged and drops a leading BOM,
        # so the marker does not leak into the returned text as "\ufeff".
        with open(path, "r", encoding="utf-8-sig") as f:
            return f.read()

    return "Unsupported file type"