Spaces:
Runtime error
Runtime error
| '''module for data-related functionalities''' | |
| import os | |
| import pymupdf | |
def change_pdf_files(path: str) -> None:
    """
    Rename the pdf files in a directory to a more programming-friendly
    form: spaces become underscores and the whole name is lower-cased.
    The '.pdf' extension is matched case-insensitively, and entries whose
    name is already normalized are left untouched.
    Note: pdfs present in github were already processed using this
    function
    Args:
        path - path to the pdf dir
    Raises:
        FileExistsError - if the normalized name is already taken by a
            different file. Files handled before the clash stay renamed.
    """
    for pdf in os.listdir(path):
        # Compare the extension case-insensitively so 'Report.PDF' is
        # normalized as well.
        if not pdf.lower().endswith('.pdf'):
            continue
        new_name = pdf.replace(' ', '_').lower()
        if new_name == pdf:
            continue  # already normalized, nothing to rename
        old_path = os.path.join(path, pdf)
        new_path = os.path.join(path, new_name)
        # os.rename silently replaces an existing file on POSIX, which would
        # destroy a different pdf that already owns the target name. The
        # samefile check keeps case-only renames working on case-insensitive
        # filesystems, where the target "exists" but is the source itself.
        if os.path.exists(new_path) and not os.path.samefile(old_path, new_path):
            raise FileExistsError(
                f"cannot rename {old_path!r}: {new_path!r} already exists"
            )
        os.rename(old_path, new_path)
def extract_txt_from_pdf(path:str, pages:list=None) -> str:
    """
    Function for extracting text from pdf. NOTE: needs testing
    and more controlled text extraction (at the moment ALL text getting extracted)
    Args:
        path - path to the pdf to be read
        pages - optional, list of 0-based page indices to be read, in the
            order given; when None every page of the document is read
    Outs:
        the extracted text in str format (the page texts are concatenated
        without any separator)
    """
    # The context manager closes the document even when a page lookup or
    # the text extraction raises, so the file handle is never leaked.
    with pymupdf.open(path) as doc:
        # An explicit (possibly empty) list is honoured as given; only None
        # means "read the whole document".
        page_numbers = range(doc.page_count) if pages is None else pages
        return ''.join(doc[i].get_text() for i in page_numbers)
def main():
    '''Script entry point: normalize the pdf file names in the 'data' directory.'''
    pdf_dir = 'data'
    change_pdf_files(pdf_dir)
# Run main() only when this file is executed as a script, not when imported.
if __name__=='__main__':
    main()