Spaces:
Runtime error
Runtime error
File size: 1,536 Bytes
8447d06 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 |
'''module for data-related functionalities'''
import os
import pymupdf
def change_pdf_files(path:str)-> None:
    """
    Function for renaming supplied pdf files to more
    programming-friendly way (spaces replaced with underscores,
    whole file name lower-cased).
    Note: pdfs present in github were already processed using this
    function
    Args:
        path - path to the pdf dir
    """
    for pdf in os.listdir(path):
        # compare the extension case-insensitively so that e.g. 'Report.PDF'
        # is normalised as well instead of being silently skipped
        if not pdf.lower().endswith('.pdf'):
            continue
        new_name = pdf.replace(' ', '_').lower()
        if new_name == pdf:
            # name is already in the normalised form - nothing to rename
            continue
        os.rename(os.path.join(path, pdf), os.path.join(path, new_name))
def extract_txt_from_pdf(path:str, pages:list=None) -> str:
    """
    Function for extracting text from pdf. NOTE: needs testing
    and more controlled text extraction (at the moment ALL text getting extracted)
    Args:
        path - path to the pdf to be read
        pages - optional, list of page indices to be read, in the given
            order; when None every page of the document is read
    Outs:
        the extracted text in str format (the page texts are concatenated
        without any separator)
    """
    # the context manager closes the document on exit, also when an
    # invalid page index makes the lookup below raise
    with pymupdf.open(path) as doc:
        # `pages is None` (not truthiness) so an explicit empty list yields ''
        page_numbers = range(doc.page_count) if pages is None else pages
        return ''.join(doc[i].get_text() for i in page_numbers)
def main():
    '''script entry point: normalise the pdf file names in the "data" dir'''
    data_dir = 'data'  # relative path, resolved against the current working dir
    change_pdf_files(data_dir)
# run main() only when the file is executed as a script, not when imported
if __name__=='__main__':
    main()
|