import os

import numpy as np
import pandas as pd

# The PDF and DOCX backends are third-party and only needed for their own
# file types. Guarding the imports lets the module load (and plain-text
# parsing work) when one of them is missing; the matching parser method
# raises ImportError at the point the backend is actually required.
try:
    import fitz  # PyMuPDF
except ImportError:
    fitz = None

try:
    import docx  # python-docx
except ImportError:
    docx = None


class BaseParser:
    """Extract plain text from a ``.txt``, ``.pdf`` or ``.docx`` file.

    Attributes:
        file_name: The name/path exactly as passed to the constructor.
        file_path: Path used for reading. Relative names get a ``./``
            prefix; absolute names are kept unchanged.
        file_type: Text after the last ``.`` in ``file_name`` (the whole
            name when it contains no ``.``), in its original case.
    """

    def __init__(self, file_name: str) -> None:
        """Record the file name and derive its read path and type.

        Args:
            file_name: Name or path of the file to parse.
        """
        self.file_name = file_name
        # Prefixing an absolute path with "./" would silently turn it into
        # a path relative to the working directory, so only relative names
        # receive the prefix.
        if os.path.isabs(file_name):
            self.file_path = file_name
        else:
            self.file_path = f'./{file_name}'
        self.file_type = file_name.split('.')[-1]

    def fitz_pymupdf_parser(self) -> str:
        """Return the text of every page of the PDF at ``file_path``.

        The concatenated text is also printed to stdout.

        Raises:
            ImportError: If PyMuPDF (``fitz``) is not installed.
        """
        if fitz is None:
            raise ImportError("PyMuPDF (fitz) is required to parse PDF files")
        doc = fitz.open(self.file_path)
        try:
            # The encode/decode round trip with "ignore" drops any code
            # points that cannot be represented in UTF-8.
            text = "".join(
                page.get_text().encode("utf-8", "ignore").decode("utf-8", "ignore")
                for page in doc
            )
        finally:
            # Release the document even if text extraction fails.
            doc.close()
        print(text)
        return text

    def docx_parser(self) -> str:
        """Return the text of every paragraph of the DOCX at ``file_path``.

        Each paragraph is followed by a newline. The concatenated text is
        also printed to stdout.

        Raises:
            ImportError: If python-docx (``docx``) is not installed.
        """
        if docx is None:
            raise ImportError("python-docx (docx) is required to parse DOCX files")
        document = docx.Document(self.file_path)
        text = "".join(
            paragraph.text + "\n" for paragraph in document.paragraphs
        )
        print(text)
        return text

    def parse_pdf(self) -> str:
        """Parse the file according to its extension and return its text.

        Despite the name, this dispatches on ``file_type`` and handles
        ``txt``, ``pdf`` and ``docx`` (matched case-insensitively).

        Returns:
            The extracted text, or an empty string when the extension is
            not one of the supported types.
        """
        kind = self.file_type.lower()
        if kind == "txt":
            # Opened with the platform default encoding; the context
            # manager closes the handle once the content has been read.
            with open(self.file_path, "r") as handle:
                return handle.read()
        if kind == "pdf":
            return self.fitz_pymupdf_parser()
        if kind == "docx":
            return self.docx_parser()
        return ""