Spaces:
Runtime error
Runtime error
File size: 1,096 Bytes
b0716cb |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 |
import pandas as pd
import numpy as np
import fitz
import docx
class BaseParser():
    """Extract plain text from a ``.txt``, ``.pdf`` or ``.docx`` file.

    Attributes set by the constructor:
        file_name: the name/path exactly as supplied by the caller.
        file_path: ``file_name`` prefixed with ``./``; used by the PDF and
            DOCX readers.
        file_type: the text after the last ``.`` in ``file_name`` (the whole
            name when it contains no dot), kept in its original case.
    """

    def __init__(self, file_name: str) -> None:
        """Record the file name and derive its path and extension."""
        self.file_name = file_name
        self.file_path = f'./{file_name}'
        self.file_type = file_name.split('.')[-1]

    def fitz_pymupdf_parser(self) -> str:
        """Return the concatenated text of every page of the PDF.

        The text of each page is round-tripped through UTF-8 with
        ``errors="ignore"`` so that characters which cannot be encoded are
        dropped instead of raising later on. The result is also printed.
        """
        # The context manager closes the document even if a page fails.
        with fitz.open(self.file_path) as doc:
            text = "".join(
                page.get_text().encode("utf-8", "ignore").decode("utf-8", "ignore")
                for page in doc
            )
        print(text)
        return text

    def docx_parser(self) -> str:
        """Return the text of every paragraph, each followed by a newline.

        Only ``Document.paragraphs`` is read. The result is also printed.
        """
        docs = docx.Document(self.file_path)
        text = "".join(paragraph.text + "\n" for paragraph in docs.paragraphs)
        print(text)
        return text

    def parse_pdf(self) -> str:
        """Parse the file according to its extension and return its text.

        Despite the name, this dispatches on ``file_type`` for ``txt``,
        ``pdf`` and ``docx`` (matched case-insensitively).

        Returns:
            The extracted text, or ``""`` when the extension is not one of
            the supported types.
        """
        # Compare case-insensitively so "REPORT.PDF" is not silently skipped;
        # self.file_type itself is left untouched for existing callers.
        kind = self.file_type.lower()
        parsed_text = ""
        if kind == "txt":
            # Opens file_name (not file_path), as the original code did.
            # "with" guarantees the handle is closed; explicit UTF-8 keeps
            # decoding independent of the platform's default encoding.
            with open(self.file_name, "r", encoding="utf-8") as handle:
                parsed_text = handle.read()
        elif kind == "pdf":
            parsed_text = self.fitz_pymupdf_parser()
        elif kind == "docx":
            parsed_text = self.docx_parser()
        return parsed_text
|