# File size: 6,677 Bytes | 5cb189e
import os
import re
import json
import fitz
from PIL import Image
import pytesseract
import spacy
import gradio as gr
# --- Global Configuration and Initialization ---
# Load the spaCy English model once at import time so that every request reuses
# the same pipeline; extract_accurate_information() runs it over the contact
# section to look for a location (GPE) entity.
nlp = spacy.load("en_core_web_sm")
# On Hugging Face Spaces, Tesseract is usually in the PATH.
# If pytesseract cannot find the binary, point it at the executable explicitly,
# for example on Linux:
# pytesseract.pytesseract.tesseract_cmd = r'/usr/bin/tesseract'
def extract_text_from_pdf(pdf_path):
    """Extract the plain text of every page in a PDF file.

    Args:
        pdf_path: Path of the PDF document, opened with PyMuPDF (``fitz``).

    Returns:
        The text of all pages concatenated in page order. If the document
        cannot be opened or a page cannot be read, the error is printed and
        whatever text was collected before the failure (possibly ``""``) is
        returned instead of raising.
    """
    page_texts = []
    try:
        with fitz.open(pdf_path) as doc:
            # Append page by page (rather than joining a generator) so that
            # text gathered before a mid-document failure is still returned.
            for page in doc:
                page_texts.append(page.get_text())
    except Exception as e:
        # Best-effort extraction: report the problem and fall through.
        print(f"Error reading PDF {pdf_path}: {e}")
    # Join once at the end instead of growing a string with += per page.
    return "".join(page_texts)
def extract_text_from_image(image_path):
    """Extract text from an image file using Tesseract OCR.

    Args:
        image_path: Path of the image to open with Pillow and pass to
            ``pytesseract.image_to_string``.

    Returns:
        The recognised text, or ``""`` if the image cannot be opened or OCR
        fails (the error is printed rather than raised).
    """
    text = ""
    try:
        # Open the image as a context manager so its underlying file handle
        # is closed as soon as OCR finishes, instead of being leaked.
        with Image.open(image_path) as image:
            text = pytesseract.image_to_string(image)
    except Exception as e:
        # Best-effort extraction: report the problem and return empty text.
        print(f"Error reading image {image_path}: {e}")
    return text
# Section names in the order they appear in the dict returned by parse_sections().
_SECTION_NAMES = ('contact_info', 'experience', 'education', 'projects', 'skills', 'summary')

# One compiled, case-insensitive alternation per section. Dict order matters:
# the first section whose pattern matches a line wins.
_SECTION_PATTERNS = {
    section: re.compile("|".join(keywords), re.IGNORECASE)
    for section, keywords in {
        'experience': [r'\bexperience\b', r'work history', r'professional experience'],
        'education': [r'\beducation\b'],
        'projects': [r'\bprojects\b', r'personal projects'],
        'skills': [r'\bskills\b', r'technical skills'],
        'summary': [r'\bsummary\b', r'profile', r'objective'],
    }.items()
}


def parse_sections(text):
    """Split resume text into logical sections.

    The text is scanned line by line. Every non-blank line is appended to the
    section that is current at that point; the current section starts as
    ``'contact_info'`` and changes whenever a line contains one of the section
    keywords (anywhere in the line, case-insensitively). The line that
    triggers a switch is itself stored in the new section.

    Args:
        text: Raw resume text. ``None`` or ``""`` yields all-empty sections.

    Returns:
        A dict with the keys ``contact_info``, ``experience``, ``education``,
        ``projects``, ``skills`` and ``summary``. Each value is the section's
        lines joined together, every line terminated by ``'\\n'``, or ``''``
        if nothing was assigned to that section.
    """
    collected = {name: [] for name in _SECTION_NAMES}
    if not text:
        return {name: '' for name in _SECTION_NAMES}

    current_section = 'contact_info'
    for line in text.split('\n'):
        if not line.strip():
            continue  # blank lines never switch or populate a section
        for section, pattern in _SECTION_PATTERNS.items():
            if pattern.search(line):
                current_section = section
                break
        collected[current_section].append(line + '\n')

    # Join once per section instead of growing strings inside the loop.
    return {name: "".join(parts) for name, parts in collected.items()}
def extract_accurate_information(text):
    """Extract structured fields from raw resume text, section by section.

    The text is first split with ``parse_sections``; each field is then read
    from the section it is expected to live in.

    Args:
        text: Raw resume text as produced by the PDF or OCR extractors.

    Returns:
        A dict with the following keys:

        * ``first_name`` / ``middle_name`` / ``last_name`` -- taken from the
          first non-blank line of the contact section, unless that line
          contains the detected email or phone number.
        * ``email`` -- first email address found in the contact section.
        * ``phone`` -- first 11-digit number starting with 010, 011, 012 or
          015 found in the contact section.
        * ``location`` -- text of the first spaCy ``GPE`` entity in the
          contact section.
        * ``major`` -- first education line mentioning "bachelor" or
          "business information system".
        * ``graduation_year`` -- latest four-digit year (19xx or 20xx) in the
          education section, as an ``int``.
        * ``experience`` -- non-blank lines of the experience section.
        * ``project_names`` -- non-blank lines of the projects section with
          leading bullet characters and surrounding periods removed.
        * ``experience_years`` -- never populated by this function.

        Fields that cannot be determined stay ``None`` (or ``[]`` for the two
        list fields).
    """
    data = {
        "first_name": None, "middle_name": None, "last_name": None, "email": None,
        "phone": None, "major": None, "graduation_year": None,
        "experience_years": None, "experience": [], "project_names": [],
        "location": None
    }
    sections = parse_sections(text)
    contact_section = sections['contact_info']

    # Regex for email and Egyptian phone numbers.
    # The TLD class is [A-Za-z]; the previous [A-Z|a-z] also accepted a
    # literal '|' character.
    email_regex = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
    phone_regex = r'\b(01[0125]\d{8})\b'
    email_match = re.search(email_regex, contact_section)
    phone_match = re.search(phone_regex, contact_section)
    data['email'] = email_match.group(0) if email_match else None
    data['phone'] = phone_match.group(0) if phone_match else None

    # Extract Name: the first contact line, provided it is not the line that
    # carries the email address or phone number.
    contact_lines = [line.strip() for line in contact_section.split('\n') if line.strip()]
    if contact_lines:
        full_name = contact_lines[0]
        holds_email = bool(data['email']) and data['email'] in full_name
        holds_phone = bool(data['phone']) and data['phone'] in full_name
        if not holds_email and not holds_phone:
            name_parts = full_name.split()
            if name_parts:
                data['first_name'] = name_parts[0]
                if len(name_parts) > 2:
                    data['middle_name'] = " ".join(name_parts[1:-1])
                    data['last_name'] = name_parts[-1]
                elif len(name_parts) == 2:
                    data['last_name'] = name_parts[1]

    # Extract Location using spaCy (globally loaded nlp object): first GPE wins.
    doc = nlp(contact_section)
    data["location"] = next(
        (ent.text for ent in doc.ents if ent.label_ == "GPE"), None
    )

    # Education
    education_section = sections['education']
    if education_section:
        # Accept 19xx as well as 20xx so earlier graduation years are not
        # silently dropped; the latest year is taken as the graduation year.
        years = re.findall(r'\b((?:19|20)\d{2})\b', education_section)
        if years:
            data['graduation_year'] = max(int(y) for y in years)
        for line in education_section.split('\n'):
            lowered = line.lower()
            if "bachelor" in lowered or "business information system" in lowered:
                data['major'] = line.strip()
                break

    # Experience: keep content lines, dropping lines that start with the
    # "experience" header word. The check runs on the stripped line so an
    # indented header is dropped as well.
    experience_section = sections['experience']
    if experience_section:
        stripped_lines = (line.strip() for line in experience_section.split('\n'))
        data['experience'] = [
            line for line in stripped_lines
            if line and not re.match(r'\bexperience\b', line, re.IGNORECASE)
        ]

    # Projects: same header filtering, then remove bullets and stray periods.
    projects_section = sections['projects']
    if projects_section:
        stripped_lines = (line.strip() for line in projects_section.split('\n'))
        project_lines = [
            line for line in stripped_lines
            if line and not re.match(r'\bprojects\b', line, re.IGNORECASE)
        ]
        data['project_names'] = [
            re.sub(r'^[•\-\*]\s*', '', line).strip('.') for line in project_lines
        ]
    return data
def process_resume(file):
    """Gradio callback: extract structured data from an uploaded resume.

    Args:
        file: The uploaded file as delivered by Gradio. Accepted forms are a
            path (``str`` or ``os.PathLike``), a file-like object exposing the
            path as ``.name``, or ``None`` when nothing was uploaded.

    Returns:
        A ``(status, data)`` tuple. On success ``data`` is the extracted
        information serialised as an indented JSON string; in every other
        case it is an empty dict and ``status`` explains what went wrong.
    """
    if file is None:
        return "Please upload a resume file.", {}

    # Accept both a plain path and a tempfile-like wrapper. Paths are checked
    # first because pathlib objects also have a ``.name`` (the bare file
    # name), which must not be mistaken for the full path.
    if isinstance(file, (str, os.PathLike)):
        file_path = os.fspath(file)
    else:
        file_path = file.name

    _, file_extension = os.path.splitext(file_path)
    extension = file_extension.lower()
    image_extensions = (".png", ".jpg", ".jpeg", ".tiff", ".tif", ".bmp")

    if extension == ".pdf":
        text = extract_text_from_pdf(file_path)
    elif extension in image_extensions:
        text = extract_text_from_image(file_path)
    else:
        # Report the extension exactly as the user supplied it.
        return f"Unsupported file format: {file_extension}. Please upload a PDF or image file.", {}

    if text:
        extracted_data = extract_accurate_information(text)
        if extracted_data:
            return "Resume processed successfully!", json.dumps(extracted_data, indent=4)
    return "Failed to extract information from the resume. Please check the file format and content.", {}
# --- Gradio Interface ---
# Single-function UI: one file upload in, a status message and the extracted
# data out, both produced by process_resume().
iface = gr.Interface(
    fn=process_resume,
    inputs=gr.File(type="filepath", label="Upload Resume (PDF or Image)"),
    outputs=[
        gr.Textbox(label="Status"),
        gr.Json(label="Extracted Data"),
    ],
    title="Resume Parser",
    description="Upload a resume (PDF or image) to extract key information.",
    allow_flagging="never",
    # No bundled examples; add file paths here if you have them,
    # for example "./examples/sample_resume.pdf".
    examples=[],
)

# Start the web app only when this file is run as a script, not on import.
if __name__ == "__main__":
    iface.launch()