# HR_Model_CV_Scoring / resume_parser.py
# mahmodGendy's picture
# Upload 6 files
# 903a1b0 verified
import logging
import os
import re
import tempfile

import docx
import PyPDF2
import pytesseract
from pdf2image import convert_from_path
from PIL import Image
def clean_text(text):
    """Collapse every run of whitespace into one space and trim the ends.

    Any falsy input (``None``, ``""``, ``0``, ...) produces an empty string.
    Everything else is converted with ``str()`` before being normalised.
    """
    if text:
        collapsed = re.sub(r"\s+", " ", str(text))
        return collapsed.strip()
    return ""
def extract_text_from_image(file_path):
    """Run OCR on an image file and return the whitespace-normalised text.

    Args:
        file_path: Path of the image to read with ``PIL.Image.open``.

    Returns:
        The text recognised by ``pytesseract``, passed through
        ``clean_text``. An empty string is returned if the image cannot be
        opened or OCR fails; the failure is logged as a warning.
    """
    try:
        # The context manager closes the underlying file handle once OCR is
        # done instead of leaving it to the garbage collector.
        with Image.open(file_path) as img:
            raw_text = pytesseract.image_to_string(img)
    except Exception as exc:
        # Best-effort extraction: report the problem but never crash the
        # caller. ``Exception`` (not a bare ``except``) lets
        # KeyboardInterrupt / SystemExit propagate.
        logging.getLogger(__name__).warning(
            "Image OCR failed for %s: %s", file_path, exc
        )
        return ""
    return clean_text(raw_text)
def extract_text_from_docx(file_path):
    """Return the paragraph text of a Word document as one normalised string.

    Args:
        file_path: Path of the document to open with ``docx.Document``.

    Returns:
        The text of all paragraphs joined by single spaces and passed
        through ``clean_text``. An empty string is returned if the document
        cannot be read; the failure is logged as a warning.
    """
    try:
        document = docx.Document(file_path)
        joined = " ".join(paragraph.text for paragraph in document.paragraphs)
    except Exception as exc:
        # Best-effort extraction: report the problem but never crash the
        # caller. ``Exception`` (not a bare ``except``) lets
        # KeyboardInterrupt / SystemExit propagate.
        logging.getLogger(__name__).warning(
            "DOCX extraction failed for %s: %s", file_path, exc
        )
        return ""
    return clean_text(joined)
def extract_text_from_pdf(file_path, min_text_length=100):
    """Extract text from a PDF, falling back to OCR when little text is found.

    The embedded text layer is read first with ``PyPDF2``. If the stripped
    result is shorter than ``min_text_length`` characters, every page is
    rendered with ``convert_from_path`` and passed to ``pytesseract``; the
    OCR output is appended to whatever the text layer produced.

    Args:
        file_path: Path of the PDF file.
        min_text_length: Minimum number of characters the text layer must
            yield to skip the OCR fallback. Defaults to 100.

    Returns:
        All collected text joined by spaces and passed through
        ``clean_text``. Failures are logged as warnings and never raised, so
        an unreadable file yields an empty string.
    """
    log = logging.getLogger(__name__)
    chunks = []

    try:
        with open(file_path, "rb") as f:
            reader = PyPDF2.PdfReader(f)
            for page in reader.pages:
                page_text = page.extract_text()
                if page_text:
                    chunks.append(page_text)
    except Exception as exc:
        # Keep whatever pages were read before the failure.
        log.warning("PDF text extraction failed for %s: %s", file_path, exc)

    if len(" ".join(chunks).strip()) < min_text_length:
        try:
            images = convert_from_path(file_path)
        except Exception as exc:
            log.warning("PDF rendering for OCR failed for %s: %s", file_path, exc)
            images = []

        for page_number, image in enumerate(images, start=1):
            try:
                # OCR the in-memory page image directly; no temporary file
                # has to be written and reopened by name.
                page_text = pytesseract.image_to_string(image)
            except Exception as exc:
                # One bad page must not discard the remaining pages.
                log.warning(
                    "OCR failed for page %d of %s: %s", page_number, file_path, exc
                )
                continue
            if page_text:
                chunks.append(page_text)

    # Joining with a space keeps the last word of one page from fusing with
    # the first word of the next.
    return clean_text(" ".join(chunks))
def parse_resume(file_path):
    """Pick a text extractor based on the file extension and run it.

    The extension is compared case-insensitively. ``.pdf`` goes to
    ``extract_text_from_pdf``, ``.docx``/``.doc`` to
    ``extract_text_from_docx``, and ``.png``/``.jpg``/``.jpeg`` to
    ``extract_text_from_image``. Any other extension returns an empty string.

    NOTE(review): ``.doc`` is sent to the same reader as ``.docx``; confirm
    that reader can actually parse legacy ``.doc`` files.
    """
    _, suffix = os.path.splitext(file_path)
    suffix = suffix.lower()

    if suffix == ".pdf":
        extracted = extract_text_from_pdf(file_path)
    elif suffix in (".docx", ".doc"):
        extracted = extract_text_from_docx(file_path)
    elif suffix in (".png", ".jpg", ".jpeg"):
        extracted = extract_text_from_image(file_path)
    else:
        extracted = ""
    return extracted