File size: 6,677 Bytes
5cb189e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
import os
import re
import json
import fitz
from PIL import Image
import pytesseract
import spacy
import gradio as gr

# --- Global Configuration and Initialization ---
# Load the small English spaCy pipeline once, at import time, so that every
# call to extract_accurate_information() reuses the same `nlp` object instead
# of reloading the model per request.
nlp = spacy.load("en_core_web_sm")

# On Hugging Face Spaces, Tesseract is usually in the PATH.
# If you encounter issues, you might need to specify the path, but generally not needed.
# pytesseract.pytesseract.tesseract_cmd = r'/usr/bin/tesseract' # Example path for Linux

def extract_text_from_pdf(pdf_path):
    """Return the text of every page of the PDF at ``pdf_path``, concatenated.

    Failures while opening or reading the document are reported on stdout
    and swallowed (best effort): whatever text was collected before the
    error is returned, which is the empty string if nothing was read.
    """
    page_texts = []
    try:
        with fitz.open(pdf_path) as document:
            for page in document:
                page_texts.append(page.get_text())
    except Exception as e:
        print(f"Error reading PDF {pdf_path}: {e}")
    # Joining outside the try keeps any pages gathered before a failure.
    return "".join(page_texts)

def extract_text_from_image(image_path):
    """Run OCR on the image at ``image_path`` and return the recognised text.

    The image is opened in a ``with`` block so its underlying file handle is
    released as soon as OCR finishes (or fails) instead of lingering until
    garbage collection.

    Any error (missing file, unsupported image, Tesseract failure) is
    reported on stdout and swallowed; the empty string is returned in that
    case so callers can treat "no text" and "unreadable" alike.
    """
    try:
        with Image.open(image_path) as image:
            return pytesseract.image_to_string(image)
    except Exception as e:
        print(f"Error reading image {image_path}: {e}")
        return ""

def parse_sections(text):
    """Bucket the lines of a resume into named sections.

    The text is walked line by line. Blank lines are dropped. Every other
    line is tested against the section keyword patterns (case-insensitive,
    anywhere in the line); the first section, in declaration order, with a
    matching pattern becomes the active section. The line, including a
    matching header line itself, is then appended to the active section
    followed by a newline. Lines before any match land in 'contact_info'.

    Returns a dict with the keys 'contact_info', 'experience', 'education',
    'projects', 'skills' and 'summary', each mapping to a (possibly empty)
    string.
    """
    # Order matters: the first section whose pattern matches wins.
    header_patterns = (
        ('experience', (r'\bexperience\b', r'work history', r'professional experience')),
        ('education', (r'\beducation\b',)),
        ('projects', (r'\bprojects\b', r'personal projects')),
        ('skills', (r'\bskills\b', r'technical skills')),
        ('summary', (r'\bsummary\b', r'profile', r'objective')),
    )
    section_names = ('contact_info', 'experience', 'education',
                     'projects', 'skills', 'summary')

    collected = {name: [] for name in section_names}
    active = 'contact_info'

    for raw_line in text.split('\n'):
        if raw_line.strip() == '':
            continue

        matched = next(
            (
                name
                for name, patterns in header_patterns
                if any(re.search(p, raw_line, re.IGNORECASE) for p in patterns)
            ),
            None,
        )
        if matched is not None:
            active = matched

        collected[active].append(raw_line + '\n')

    return {name: ''.join(chunks) for name, chunks in collected.items()}

def extract_accurate_information(text):
    """Extract structured resume fields from raw text, section by section.

    The text is first split with ``parse_sections``; each field is then
    derived from the section it is expected to live in:

    * email / phone: first regex match in the contact section (the phone
      pattern targets 11-digit Egyptian mobile numbers: 010/011/012/015).
    * first/middle/last name: the first non-empty contact line, unless that
      line contains the detected email or phone.
    * location: the first spaCy entity labelled ``GPE`` in the contact section.
    * graduation_year: the largest 20xx year found in the education section.
    * major: the first education line mentioning "bachelor" or
      "business information system".
    * experience: non-empty experience lines, minus lines that start with
      the word "experience".
    * project_names: non-empty project lines, minus lines that start with
      the word "projects", with a leading bullet and surrounding dots removed.

    Returns a dict with the keys shown in ``data`` below. Fields that could
    not be determined stay ``None`` (or an empty list); ``experience_years``
    is never populated by this function.
    """
    data = {
        "first_name": None, "middle_name": None, "last_name": None, "email": None,
        "phone": None, "major": None, "graduation_year": None,
        "experience_years": None, "experience": [], "project_names": [],
        "location": None
    }

    sections = parse_sections(text)
    contact_section = sections['contact_info']

    # The TLD class is [A-Za-z]: writing it as [A-Z|a-z] would also accept a
    # literal '|', so "a@b.com|0100..." would be captured as "a@b.com|".
    email_regex = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
    phone_regex = r'\b(01[0125]\d{8})\b'

    # Search once per pattern and reuse the match object.
    email_match = re.search(email_regex, contact_section)
    phone_match = re.search(phone_regex, contact_section)
    data['email'] = email_match.group(0) if email_match else None
    data['phone'] = phone_match.group(0) if phone_match else None

    # Name: taken from the first non-empty contact line, but only when that
    # line is not actually the email/phone line.
    contact_lines = [line.strip() for line in contact_section.split('\n') if line.strip()]
    if contact_lines:
        full_name = contact_lines[0]
        holds_email = bool(data['email']) and data['email'] in full_name
        holds_phone = bool(data['phone']) and data['phone'] in full_name
        if not holds_email and not holds_phone:
            name_parts = full_name.split()
            data['first_name'] = name_parts[0]
            if len(name_parts) > 2:
                # Everything between the first and last token is the middle name.
                data['middle_name'] = " ".join(name_parts[1:-1])
                data['last_name'] = name_parts[-1]
            elif len(name_parts) == 2:
                data['last_name'] = name_parts[1]

    # Location: first geopolitical entity spaCy finds in the contact section
    # (uses the module-level `nlp` pipeline).
    doc = nlp(contact_section)
    for ent in doc.ents:
        if ent.label_ == "GPE":
            data["location"] = ent.text
            break

    # Education
    education_section = sections['education']
    if education_section:
        years = re.findall(r'\b(20\d{2})\b', education_section)
        if years:
            # The latest year mentioned is taken as the graduation year.
            data['graduation_year'] = max(int(y) for y in years)

        for line in education_section.split('\n'):
            lowered = line.lower()
            if "bachelor" in lowered or "business information system" in lowered:
                data['major'] = line.strip()
                break

    # Experience: keep every non-empty line except a leading "Experience" header.
    experience_section = sections['experience']
    if experience_section:
        data['experience'] = [
            line.strip() for line in experience_section.split('\n')
            if line.strip() and not re.match(r'\bexperience\b', line, re.IGNORECASE)
        ]

    # Projects: drop a leading "Projects" header, then strip bullets and dots.
    projects_section = sections['projects']
    if projects_section:
        project_lines = [
            line.strip() for line in projects_section.split('\n')
            if line.strip() and not re.match(r'\bprojects\b', line, re.IGNORECASE)
        ]
        data['project_names'] = [re.sub(r'^[•\-\*]\s*', '', line).strip('.') for line in project_lines]

    return data

def process_resume(file):
    """Gradio callback: parse an uploaded resume and report the result.

    Args:
        file: The upload handed over by Gradio. May be ``None`` (nothing
            uploaded), a filesystem path (``str`` / ``os.PathLike``), or a
            file-like object exposing the path as ``.name``.

    Returns:
        A ``(status_message, payload)`` tuple. On success ``payload`` is the
        extracted data serialised as an indented JSON string; on any failure
        it is an empty dict.
    """
    if file is None:
        return "Please upload a resume file.", {}

    # Accept both shapes of upload: a plain path, or a temp-file wrapper
    # whose path lives in `.name`. Reading `.name` unconditionally would
    # raise AttributeError when a path string is passed.
    if isinstance(file, (str, os.PathLike)):
        file_path = os.fspath(file)
    else:
        file_path = file.name

    _, file_extension = os.path.splitext(file_path)
    extension = file_extension.lower()

    if extension == ".pdf":
        text = extract_text_from_pdf(file_path)
    elif extension in (".png", ".jpg", ".jpeg", ".tiff"):
        text = extract_text_from_image(file_path)
    else:
        return f"Unsupported file format: {file_extension}. Please upload a PDF or image file.", {}

    if text:
        extracted_data = extract_accurate_information(text)
        if extracted_data:
            return "Resume processed successfully!", json.dumps(extracted_data, indent=4)
    # Reached when no text could be extracted (unreadable or empty document).
    return "Failed to extract information from the resume. Please check the file format and content.", {}

# --- Gradio Interface ---
# Build the web UI: one file-upload input wired to process_resume, whose two
# return values (status message, extracted data) feed the two outputs below
# in order.
iface = gr.Interface(
    fn=process_resume,
    # NOTE(review): with type="filepath" Gradio presumably hands `fn` the
    # upload as a path string rather than a file object — confirm against the
    # installed Gradio version.
    inputs=gr.File(type="filepath", label="Upload Resume (PDF or Image)"),
    outputs=[
        gr.Textbox(label="Status"),
        gr.Json(label="Extracted Data")
    ],
    title="Resume Parser",
    description="Upload a resume (PDF or image) to extract key information.",
    allow_flagging="never",
    examples=[
        # You can add example files here if you have them.
        # For example: "./examples/sample_resume.pdf"
    ]
)

# Start the web server only when this file is run as a script, not when it is
# imported.
if __name__ == "__main__":
    iface.launch()