falconstreamlit / file_utils.py
eaglelandsonce's picture
Update file_utils.py
fd2a616 verified
raw
history blame contribute delete
658 Bytes
import fitz # PyMuPDF
import docx
"""
def extract_text_from_pdf(pdf_file):
text = ""
with fitz.open(stream=pdf_file.read(), filetype="pdf") as doc:
for page in doc:
text += page.get_text()
return text
"""
def extract_text_from_pdf(file):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        file: Binary file-like object exposing ``read()``; its entire
            content is read into memory and parsed as a PDF.

    Returns:
        A single string made of each page's ``get_text()`` output joined
        with no extra separator. Empty string for a document with no pages.
    """
    # Open via context manager so the underlying document is always closed,
    # even if text extraction fails part-way through.
    with fitz.open(stream=file.read(), filetype="pdf") as pdf_document:
        return "".join(page.get_text() for page in pdf_document)
def extract_text_from_word(doc_file):
    """Return the text of all body paragraphs of a Word document.

    Args:
        doc_file: Value handed straight to ``docx.Document``.

    Returns:
        The ``text`` of each entry in ``Document.paragraphs``, concatenated
        in order.
    """
    document = docx.Document(doc_file)
    # NOTE: paragraphs are joined with no separator, so the last word of one
    # paragraph runs directly into the first word of the next.
    return "".join(paragraph.text for paragraph in document.paragraphs)