Spaces:
Runtime error
Runtime error
| import pandas as pd | |
| import numpy as np | |
| import fitz | |
| import docx | |
class BaseParser:
    """Extract plain text from a ``.txt``, ``.pdf`` or ``.docx`` file.

    Attributes set by ``__init__``:
        file_name: The name/path exactly as supplied by the caller.
        file_path: ``file_name`` prefixed with ``./``.
        file_type: Everything after the last ``.`` in ``file_name``
            (the whole name when it contains no dot). Case is preserved.
    """

    def __init__(self, file_name: str) -> None:
        """Record the file name and derive its path and extension."""
        self.file_name = file_name
        self.file_path = f'./{file_name}'
        self.file_type = file_name.split('.')[-1]

    def fitz_pymupdf_parser(self) -> str:
        """Return the concatenated text of every page, read with PyMuPDF.

        Each page's text is round-tripped through UTF-8 with
        ``errors="ignore"`` so characters that cannot be encoded are dropped
        instead of raising. The combined text is also printed to stdout.
        """
        doc = fitz.open(self.file_path)
        try:
            text = "".join(
                page.get_text().encode("utf-8", "ignore").decode("utf-8", "ignore")
                for page in doc
            )
        finally:
            # Release the underlying file handle even if extraction fails.
            doc.close()
        print(text)
        return text

    def docx_parser(self) -> str:
        """Return all paragraph texts of the document, each followed by a newline.

        The combined text is also printed to stdout.
        """
        docs = docx.Document(self.file_path)
        text = "".join(paragraph.text + "\n" for paragraph in docs.paragraphs)
        print(text)
        return text

    def parse_pdf(self) -> str:
        """Return the file's text, choosing the reader from the extension.

        Despite its name this handles ``txt``, ``pdf`` and ``docx``; the
        extension is matched case-insensitively. Any other extension yields
        an empty string.
        """
        kind = self.file_type.lower()
        parsed_text = ""
        if kind == "txt":
            # Opened via ``file_name`` (not ``file_path``) so that absolute
            # paths keep working; ``with`` guarantees the handle is closed.
            with open(self.file_name, "r") as handle:
                parsed_text = handle.read()
        elif kind == "pdf":
            parsed_text = self.fitz_pymupdf_parser()
        elif kind == "docx":
            parsed_text = self.docx_parser()
        return parsed_text