cv-assesment-tools / core /parser /file_parser.py
Jayra Ortiz
:star: added initial working architecture
b0716cb
import os

import docx
import fitz
import numpy as np
import pandas as pd
class BaseParser:
    """Extract plain text from a ``.txt``, ``.pdf`` or ``.docx`` file.

    Attributes:
        file_name: The name/path exactly as supplied by the caller.
        file_path: Path actually opened. Relative names are prefixed with
            ``./``; absolute names are used unchanged.
        file_type: Text after the last ``.`` in ``file_name`` with its
            original case (the whole name when it contains no ``.``).
    """

    def __init__(self, file_name: str) -> None:
        """Record the file name and derive its path and extension.

        Args:
            file_name: Relative or absolute path of the file to parse.
        """
        self.file_name = file_name
        # Prefixing an absolute path with "./" would turn it into a path
        # relative to the working directory, so only relative names get it.
        if os.path.isabs(file_name):
            self.file_path = file_name
        else:
            self.file_path = f'./{file_name}'
        self.file_type = file_name.split('.')[-1]

    def fitz_pymupdf_parser(self) -> str:
        """Return the text of every page of the PDF at ``file_path``.

        The result is round-tripped through UTF-8 with ``errors="ignore"``
        so characters that cannot be encoded are dropped. The text is also
        printed to stdout.

        Returns:
            The concatenated page text.
        """
        # The context manager closes the document even if extraction fails.
        with fitz.open(self.file_path) as doc:
            raw_text = "".join(page.get_text() for page in doc)
        text = raw_text.encode("utf-8", "ignore").decode("utf-8", "ignore")
        # NOTE(review): this echoes the full document to stdout; consider
        # switching to logging if that output is not relied upon.
        print(text)
        return text

    def docx_parser(self) -> str:
        """Return the paragraph text of the Word document at ``file_path``.

        Each paragraph is followed by a newline. The text is also printed
        to stdout.

        Returns:
            The concatenated paragraph text.
        """
        docs = docx.Document(self.file_path)
        text = "".join(paragraph.text + "\n" for paragraph in docs.paragraphs)
        # NOTE(review): this echoes the full document to stdout; consider
        # switching to logging if that output is not relied upon.
        print(text)
        return text

    def parse_pdf(self) -> str:
        """Parse the file according to its extension.

        Despite the name, this handles ``txt``, ``pdf`` and ``docx`` files.
        The extension is matched case-insensitively.

        Returns:
            The extracted text, or ``""`` when the extension is not one of
            the supported types.
        """
        extension = self.file_type.lower()
        if extension == "txt":
            # ``with`` guarantees the handle is closed after reading.
            with open(self.file_path, "r") as handle:
                return handle.read()
        if extension == "pdf":
            return self.fitz_pymupdf_parser()
        if extension == "docx":
            return self.docx_parser()
        return ""