File size: 1,536 Bytes
8447d06
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
'''module for data-related functionalities'''
import os
import pymupdf

def change_pdf_files(path: str) -> None:
    """
    Rename the pdf files in a directory to programming-friendly names.

    Every directory entry whose name ends in ``.pdf`` (compared
    case-insensitively) is renamed in place: spaces are replaced with
    underscores and the whole name, extension included, is lower-cased.
    Entries whose name is already in that form are left alone.
    Note: pdfs present in github were already processed using this
    function.

    Args:
        path - path to the pdf dir
    Raises:
        FileExistsError - if the normalised name is already taken by a
            different entry; nothing is overwritten in that case.
    """
    for pdf in os.listdir(path):
        if not pdf.lower().endswith('.pdf'):
            continue
        new_name = pdf.replace(' ', '_').lower()
        if new_name == pdf:
            # Already normalised; nothing to rename.
            continue
        old_path = os.path.join(path, pdf)
        new_path = os.path.join(path, new_name)
        # os.rename silently replaces an existing file on POSIX, so refuse to
        # clobber another entry. The samefile check lets a case-only rename
        # through on case-insensitive file systems, where new_path resolves
        # to the very entry being renamed.
        if os.path.exists(new_path) and not os.path.samefile(old_path, new_path):
            raise FileExistsError(
                f"cannot rename {old_path!r} to {new_path!r}: target already exists"
            )
        os.rename(old_path, new_path)

def extract_txt_from_pdf(path: str, pages: list = None) -> str:
    """
    Extract the text of a pdf as a single string.

    NOTE: needs testing and more controlled text extraction (at the
    moment ALL text of each selected page is extracted).

    Args:
        path - path to the pdf to be read
        pages - optional, list of page indices to be read, used to index
            the opened document directly; when None every page is read
            in document order
    Outs:
        the extracted text in str format (page texts concatenated with
        no separator)
    """
    # Open the document as a context manager so it is closed even when a
    # page lookup or text extraction raises.
    with pymupdf.open(path) as doc:
        page_indices = range(doc.page_count) if pages is None else pages
        # join() consumes the generator while the document is still open.
        return ''.join(doc[i].get_text() for i in page_indices)

def main():
    '''Script entry point: normalise the pdf file names in the data dir.'''
    pdf_dir = 'data'
    change_pdf_files(pdf_dir)

# Run the renaming only when executed as a script, not when imported.
if __name__ == '__main__':
    main()