File size: 1,096 Bytes
b0716cb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
import pandas as pd
import numpy as np
import fitz
import docx

class BaseParser():
    """Extract plain text from a ``.txt``, ``.pdf`` or ``.docx`` file.

    The file kind is taken from the text after the last ``.`` in the
    file name. Use :meth:`parse_pdf` as the entry point; despite its
    name it dispatches on all three supported kinds.
    """

    def __init__(self, file_name: str) -> None:
        """Record the file name and derive its path and extension.

        Args:
            file_name: Name (or path) of the file to parse.
        """
        self.file_name = file_name
        self.file_path = f'./{file_name}'
        # Stored exactly as written in the name; case is normalised only
        # at dispatch time so this attribute keeps its original value.
        self.file_type = file_name.split('.')[-1]

    def fitz_pymupdf_parser(self) -> str:
        """Return the text of every page of the PDF at ``self.file_path``.

        The combined text is also printed to stdout.
        """
        doc = fitz.open(self.file_path)
        try:
            # The encode/decode round trip with "ignore" drops any
            # characters that cannot be represented in UTF-8.
            pages = [
                page.get_text().encode("utf-8", "ignore").decode("utf-8", "ignore")
                for page in doc
            ]
        finally:
            # Release the underlying file handle even if extraction fails.
            doc.close()

        text = "".join(pages)
        print(text)
        return text

    def docx_parser(self) -> str:
        """Return the paragraphs of the DOCX at ``self.file_path``.

        Each paragraph's text is followed by a newline. The combined text
        is also printed to stdout.
        """
        document = docx.Document(self.file_path)
        text = "".join(f"{paragraph.text}\n" for paragraph in document.paragraphs)

        print(text)
        return text

    def parse_pdf(self) -> str:
        """Parse the file according to its extension and return its text.

        The extension is matched case-insensitively, so ``REPORT.PDF`` is
        handled like ``report.pdf``.

        Returns:
            The extracted text, or an empty string when the extension is
            not one of ``txt``, ``pdf`` or ``docx``.
        """
        kind = self.file_type.lower()

        if kind == "txt":
            # Context manager guarantees the handle is closed after reading.
            with open(self.file_name, "r") as handle:
                return handle.read()
        if kind == "pdf":
            return self.fitz_pymupdf_parser()
        if kind == "docx":
            return self.docx_parser()

        return ""