|
|
import os |
|
|
import re |
|
|
import json |
|
|
import fitz |
|
|
from PIL import Image |
|
|
import pytesseract |
|
|
import spacy |
|
|
import gradio as gr |
|
|
|
|
|
|
|
|
|
|
|
# Load the small English spaCy pipeline once at import time. The parsing code
# in this file runs it over the contact section of a resume and takes the
# first "GPE" entity as the candidate's location.
_SPACY_MODEL_NAME = "en_core_web_sm"
nlp = spacy.load(_SPACY_MODEL_NAME)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def extract_text_from_pdf(pdf_path):
    """Extract the plain text of every page of a PDF file.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The text of all pages concatenated in page order. If opening or
        reading the document fails, the error is printed and whatever text
        was collected before the failure is returned (an empty string when
        nothing could be read).
    """
    page_texts = []
    try:
        with fitz.open(pdf_path) as doc:
            # Collect per-page text and join once at the end instead of
            # growing a string with ``+=`` for every page.
            for page in doc:
                page_texts.append(page.get_text())
    except Exception as e:
        # Best-effort by design: an unreadable file must not crash the
        # caller, which treats an empty result as "nothing extracted".
        print(f"Error reading PDF {pdf_path}: {e}")
    return "".join(page_texts)
|
|
|
|
|
def extract_text_from_image(image_path):
    """Extract text from an image file using Tesseract OCR.

    Args:
        image_path: Path of the image file to read.

    Returns:
        The recognised text, or an empty string if the image could not be
        opened or OCR failed (the error is printed in that case).
    """
    text = ""
    try:
        # Open the image in a context manager so its file handle is released
        # as soon as OCR finishes, even when pytesseract raises.
        with Image.open(image_path) as img:
            text = pytesseract.image_to_string(img)
    except Exception as e:
        # Best-effort by design: the caller treats "" as "nothing extracted".
        print(f"Error reading image {image_path}: {e}")
    return text
|
|
|
|
|
# Heading keywords per resume section. Sections are tried in this order and the
# first one with a keyword occurring in a line wins. Every keyword is bounded
# by ``\b`` so that e.g. "objectives" or "profiles" inside ordinary sentences
# does not count as a "summary" heading.
_SECTION_KEYWORDS = {
    'experience': (r'\bexperience\b', r'\bwork history\b', r'\bprofessional experience\b'),
    'education': (r'\beducation\b',),
    'projects': (r'\bprojects\b', r'\bpersonal projects\b'),
    'skills': (r'\bskills\b', r'\btechnical skills\b'),
    'summary': (r'\bsummary\b', r'\bprofile\b', r'\bobjective\b'),
}

# One case-insensitive alternation per section, compiled once at import time
# instead of re-searching every keyword for every line.
_SECTION_PATTERNS = {
    section: re.compile('|'.join(keywords), re.IGNORECASE)
    for section, keywords in _SECTION_KEYWORDS.items()
}

# Keys of the returned mapping, in output order.
_SECTION_NAMES = ('contact_info', 'experience', 'education', 'projects', 'skills', 'summary')


def _detect_section(line):
    """Return the first section whose heading keyword occurs in *line*, else None."""
    for section, pattern in _SECTION_PATTERNS.items():
        if pattern.search(line):
            return section
    return None


def parse_sections(text):
    """Split raw resume text into logical sections.

    The text is scanned line by line. Everything before the first recognised
    heading keyword belongs to ``'contact_info'``; a line containing a heading
    keyword switches the current section and is itself stored as the first
    line of that section. Blank lines are dropped.

    Args:
        text: Raw resume text. ``None`` or an empty string yields empty
            sections.

    Returns:
        A dict with the keys ``'contact_info'``, ``'experience'``,
        ``'education'``, ``'projects'``, ``'skills'`` and ``'summary'``. Each
        value is the section's lines, each terminated by ``'\\n'`` (an empty
        string when the section was not found).
    """
    collected = {name: [] for name in _SECTION_NAMES}

    if text:
        current_section = 'contact_info'
        # splitlines() also handles "\r\n" / "\r" endings and the form-feed
        # page separators OCR output may contain, so no stray control
        # characters end up inside the stored lines.
        for line in text.splitlines():
            if not line.strip():
                continue
            current_section = _detect_section(line) or current_section
            collected[current_section].append(line)

    return {
        name: ''.join(f'{line}\n' for line in lines)
        for name, lines in collected.items()
    }
|
|
|
|
|
# Patterns used by extract_accurate_information, compiled once at import time.
# The TLD class is ``[A-Za-z]``; the earlier ``[A-Z|a-z]`` also accepted a
# literal "|" and could swallow a separator that follows the address.
_EMAIL_RE = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b')
# Eleven-digit numbers starting with 010, 011, 012 or 015.
# NOTE(review): presumably the Egyptian mobile format; numbers written with a
# country code or separators are not recognised - confirm this is intended.
_PHONE_RE = re.compile(r'\b(01[0125]\d{8})\b')
# Four-digit years from 1900 to 2099.
_YEAR_RE = re.compile(r'\b((?:19|20)\d{2})\b')
# Leading bullet character (with optional following whitespace) of a list item.
_BULLET_RE = re.compile(r'^[•\-\*]\s*')
# Heading lines to drop from the section bodies; applied with ``match`` so
# they only fire at the start of an (already stripped) line.
_EXPERIENCE_HEADING_RE = re.compile(
    r'(?:(?:professional|work)\s+)?experience\b|work\s+history\b', re.IGNORECASE
)
_PROJECTS_HEADING_RE = re.compile(r'(?:personal\s+)?projects\b', re.IGNORECASE)


def _first_match(pattern, text):
    """Return the first match of *pattern* in *text* as a string, or None."""
    match = pattern.search(text)
    return match.group(0) if match else None


def _split_name(full_name):
    """Split a full name into (first, middle, last); missing parts are None."""
    parts = full_name.split()
    if not parts:
        return None, None, None
    if len(parts) == 1:
        return parts[0], None, None
    # Everything between the first and last token counts as middle name(s).
    middle = " ".join(parts[1:-1]) or None
    return parts[0], middle, parts[-1]


def _first_gpe(text):
    """Return the text of the first spaCy "GPE" entity in *text*, or None."""
    return next((ent.text for ent in nlp(text).ents if ent.label_ == "GPE"), None)


def _find_major(education_section):
    """Return the first education line that mentions a known degree phrase, or None."""
    for line in education_section.split('\n'):
        lowered = line.lower()
        if "bachelor" in lowered or "business information system" in lowered:
            return line.strip()
    return None


def _section_entries(section_text, heading_pattern):
    """Return the stripped, non-empty lines of a section, minus heading lines."""
    stripped_lines = (line.strip() for line in section_text.split('\n'))
    return [
        line for line in stripped_lines
        if line and not heading_pattern.match(line)
    ]


def extract_accurate_information(text):
    """Extract structured fields from raw resume text, section by section.

    The text is first split with ``parse_sections``. Email, phone, name and
    location are taken from the contact section only; graduation year and
    major from the education section; the experience and project lists from
    their respective sections.

    Args:
        text: Raw resume text.

    Returns:
        A dict with the keys ``first_name``, ``middle_name``, ``last_name``,
        ``email``, ``phone``, ``major``, ``graduation_year``,
        ``experience_years``, ``experience``, ``project_names`` and
        ``location``. Fields that could not be determined stay ``None`` (or
        an empty list for ``experience`` / ``project_names``).
        ``experience_years`` is never populated by this function.
    """
    data = {
        "first_name": None, "middle_name": None, "last_name": None, "email": None,
        "phone": None, "major": None, "graduation_year": None,
        "experience_years": None, "experience": [], "project_names": [],
        "location": None
    }

    sections = parse_sections(text)
    contact_section = sections['contact_info']

    # --- Contact details -------------------------------------------------
    data['email'] = _first_match(_EMAIL_RE, contact_section)
    data['phone'] = _first_match(_PHONE_RE, contact_section)

    # The first non-empty contact line is taken as the candidate's name,
    # unless it is the line that carries the email address or phone number.
    contact_lines = [line.strip() for line in contact_section.split('\n') if line.strip()]
    if contact_lines:
        first_line = contact_lines[0]
        holds_email = bool(data['email']) and data['email'] in first_line
        holds_phone = bool(data['phone']) and data['phone'] in first_line
        if not (holds_email or holds_phone):
            (data['first_name'],
             data['middle_name'],
             data['last_name']) = _split_name(first_line)

    data["location"] = _first_gpe(contact_section)

    # --- Education -------------------------------------------------------
    education_section = sections['education']
    if education_section:
        years = _YEAR_RE.findall(education_section)
        if years:
            # The latest year mentioned is taken as the graduation year.
            data['graduation_year'] = max(int(year) for year in years)
        data['major'] = _find_major(education_section)

    # --- Experience and projects -----------------------------------------
    data['experience'] = _section_entries(sections['experience'], _EXPERIENCE_HEADING_RE)
    data['project_names'] = [
        _BULLET_RE.sub('', line).strip('.')
        for line in _section_entries(sections['projects'], _PROJECTS_HEADING_RE)
    ]

    return data
|
|
|
|
|
# Image extensions that are routed to OCR.
_IMAGE_EXTENSIONS = (".png", ".jpg", ".jpeg", ".tiff")


def process_resume(file):
    """Process an uploaded resume file for the Gradio interface.

    Args:
        file: The uploaded file, either as a path string or as an object that
            exposes the path in a ``name`` attribute. ``None`` means nothing
            was uploaded.

    Returns:
        A ``(status, data)`` tuple. On success ``data`` is the extracted
        information as a JSON string indented by four spaces; in every other
        case it is an empty dict and ``status`` explains what went wrong.
    """
    if file is None:
        return "Please upload a resume file.", {}

    # NOTE(review): the upload component is declared with type="filepath",
    # which is expected to deliver a path string rather than a file object -
    # confirm against the installed Gradio version. Both forms are accepted.
    file_path = getattr(file, "name", file)
    file_extension = os.path.splitext(file_path)[1]
    suffix = file_extension.lower()

    if suffix == ".pdf":
        text = extract_text_from_pdf(file_path)
    elif suffix in _IMAGE_EXTENSIONS:
        text = extract_text_from_image(file_path)
    else:
        return f"Unsupported file format: {file_extension}. Please upload a PDF or image file.", {}

    # Whitespace-only output (e.g. OCR of a blank page) counts as "no text":
    # parsing it would only produce an all-empty record reported as success.
    if text and text.strip():
        extracted_data = extract_accurate_information(text)
        if extracted_data:
            return "Resume processed successfully!", json.dumps(extracted_data, indent=4)

    return "Failed to extract information from the resume. Please check the file format and content.", {}
|
|
|
|
|
|
|
|
# Build the UI components first, then wire them into the interface.
# process_resume returns a (status, data) pair: the first element goes to the
# status text box, the second to the JSON viewer.
_resume_upload = gr.File(type="filepath", label="Upload Resume (PDF or Image)")
_result_components = [
    gr.Textbox(label="Status"),
    gr.Json(label="Extracted Data"),
]

iface = gr.Interface(
    fn=process_resume,
    inputs=_resume_upload,
    outputs=_result_components,
    title="Resume Parser",
    description="Upload a resume (PDF or image) to extract key information.",
    allow_flagging="never",
    examples=[],  # no bundled example files
)


# Start the web UI only when the file is run as a script, not when imported.
if __name__ == "__main__":
    iface.launch()