usef143 committed on
Commit
5cb189e
·
verified ·
1 Parent(s): 3ad3c7f

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +192 -0
app.py ADDED
@@ -0,0 +1,192 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+ import json
4
+ import fitz
5
+ from PIL import Image
6
+ import pytesseract
7
+ import spacy
8
+ import gradio as gr
9
+
10
# --- Global Configuration and Initialization ---
# Load the spaCy pipeline a single time at import so that every call that
# needs entity recognition reuses the same `nlp` object instead of reloading
# the model per request.
nlp = spacy.load("en_core_web_sm")

# On Hugging Face Spaces, Tesseract is usually already on the PATH.
# If pytesseract cannot locate the binary, point it at the executable
# explicitly; this is generally not needed. Example path for Linux:
# pytesseract.pytesseract.tesseract_cmd = r'/usr/bin/tesseract'
17
+
18
def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page of the PDF at ``pdf_path``.

    Errors are reported on stdout instead of being raised; whatever text was
    gathered before the failure (possibly none) is still returned.
    """
    page_texts = []
    try:
        with fitz.open(pdf_path) as document:
            for pdf_page in document:
                page_texts.append(pdf_page.get_text())
    except Exception as e:
        print(f"Error reading PDF {pdf_path}: {e}")
    # Joined outside the try so pages read before an error are preserved.
    return "".join(page_texts)
28
+
29
def extract_text_from_image(image_path):
    """Run Tesseract OCR on the image at ``image_path`` and return its text.

    Args:
        image_path: Path of an image file that Pillow can open.

    Returns:
        The recognised text, or an empty string when the image cannot be
        opened or OCR fails (the error is printed, not raised).
    """
    text = ""
    try:
        # Open through a context manager so the underlying file handle is
        # released as soon as OCR is done rather than whenever the Image
        # object happens to be garbage collected.
        with Image.open(image_path) as image:
            text = pytesseract.image_to_string(image)
    except Exception as e:
        print(f"Error reading image {image_path}: {e}")
    return text
37
+
38
def parse_sections(text):
    """Split resume text into logical sections keyed by section name.

    Lines are assigned to the most recently seen section heading; everything
    before the first heading goes to ``contact_info``. Blank lines are
    dropped and every kept line is stored with a trailing newline. The
    heading line itself is stored in the section it opens.

    A line only counts as a heading when its heading part (the text before
    the first ``:``, or the whole line when there is no colon) is short and
    contains a section keyword. This keeps ordinary sentences such as
    "5 years of experience in ..." and URLs such as "https://.../projects"
    from switching the current section.

    Args:
        text: Raw resume text with ``\\n`` line separators.

    Returns:
        dict mapping ``contact_info``, ``experience``, ``education``,
        ``projects``, ``skills`` and ``summary`` to their text ('' if absent).
    """
    section_keywords = {
        'experience': [r'\bexperience\b', r'work history', r'professional experience'],
        'education': [r'\beducation\b'],
        'projects': [r'\bprojects\b', r'personal projects'],
        'skills': [r'\bskills\b', r'technical skills'],
        'summary': [r'\bsummary\b', r'profile', r'objective']
    }
    # Headings are short labels; anything longer is treated as body text.
    max_heading_words = 5

    # One compiled alternation per section, checked in declaration order.
    section_patterns = [
        (section, re.compile('|'.join(keywords), re.IGNORECASE))
        for section, keywords in section_keywords.items()
    ]

    collected = {
        'contact_info': [],
        'experience': [],
        'education': [],
        'projects': [],
        'skills': [],
        'summary': [],
    }
    current_section = 'contact_info'

    for line in text.split('\n'):
        if not line.strip():
            continue

        # "Technical Skills: Python, SQL" -> heading part "Technical Skills".
        heading = line.split(':', 1)[0]
        if len(heading.split()) <= max_heading_words:
            for section, pattern in section_patterns:
                if pattern.search(heading):
                    current_section = section
                    break

        collected[current_section].append(line + '\n')

    return {name: ''.join(lines) for name, lines in collected.items()}
78
+
79
def extract_accurate_information(text):
    """Extract structured resume fields from raw text, section by section.

    The text is split with ``parse_sections`` and each field is read from the
    section where it is expected: contact details, name and location from the
    leading contact block; graduation year and major from education; and the
    non-heading lines of the experience and projects sections.

    Args:
        text: Raw resume text (PDF text extraction or OCR output).

    Returns:
        dict with keys ``first_name``, ``middle_name``, ``last_name``,
        ``email``, ``phone``, ``major``, ``graduation_year``,
        ``experience_years``, ``experience``, ``project_names`` and
        ``location``. Scalar fields that are not found stay ``None``; the two
        list fields stay empty. ``experience_years`` is not computed by this
        function and is always ``None``.
    """
    data = {
        "first_name": None, "middle_name": None, "last_name": None, "email": None,
        "phone": None, "major": None, "graduation_year": None,
        "experience_years": None, "experience": [], "project_names": [],
        "location": None
    }

    sections = parse_sections(text)
    contact_section = sections['contact_info']

    # Regex for email and Egyptian phone numbers.
    # The TLD class is [A-Za-z]: a '|' inside a character class is a literal
    # pipe, not alternation, so it must not appear there.
    email_regex = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
    phone_regex = r'\b(01[0125]\d{8})\b'

    email_match = re.search(email_regex, contact_section)
    phone_match = re.search(phone_regex, contact_section)
    data['email'] = email_match.group(0) if email_match else None
    data['phone'] = phone_match.group(0) if phone_match else None

    # Extract Name: the first non-blank contact line, unless that line is
    # really the email/phone line.
    contact_lines = [line.strip() for line in contact_section.split('\n') if line.strip()]
    if contact_lines:
        full_name = contact_lines[0]
        email_on_line = bool(data['email']) and data['email'] in full_name
        phone_on_line = bool(data['phone']) and data['phone'] in full_name
        if not email_on_line and not phone_on_line:
            name_parts = full_name.split()
            if name_parts:
                data['first_name'] = name_parts[0]
                if len(name_parts) > 2:
                    # Everything between first and last token is the middle name.
                    data['middle_name'] = " ".join(name_parts[1:-1])
                    data['last_name'] = name_parts[-1]
                elif len(name_parts) == 2:
                    data['last_name'] = name_parts[1]

    # Extract Location using spaCy (globally loaded nlp object): the first
    # entity labelled GPE in the contact block, if any.
    doc = nlp(contact_section)
    data["location"] = next(
        (ent.text for ent in doc.ents if ent.label_ == "GPE"), None
    )

    # Education
    education_section = sections['education']
    if education_section:
        # Accept 19xx as well as 20xx; the latest year is taken, so results
        # are unchanged whenever a 20xx year is present.
        years = re.findall(r'\b((?:19|20)\d{2})\b', education_section)
        if years:
            data['graduation_year'] = max(int(y) for y in years)

        for line in education_section.split('\n'):
            lowered = line.lower()
            if "bachelor" in lowered or "business information system" in lowered:
                data['major'] = line.strip()
                break

    # Experience: every non-blank line except one starting with the heading
    # word. Matching on the stripped line also filters indented headings.
    experience_section = sections['experience']
    if experience_section:
        stripped_lines = (line.strip() for line in experience_section.split('\n'))
        data['experience'] = [
            entry for entry in stripped_lines
            if entry and not re.match(r'\bexperience\b', entry, re.IGNORECASE)
        ]

    # Projects: same filtering, then drop a leading bullet and any
    # leading/trailing periods from each name.
    projects_section = sections['projects']
    if projects_section:
        stripped_lines = (line.strip() for line in projects_section.split('\n'))
        project_lines = [
            entry for entry in stripped_lines
            if entry and not re.match(r'\bprojects\b', entry, re.IGNORECASE)
        ]
        data['project_names'] = [re.sub(r'^[•\-\*]\s*', '', line).strip('.') for line in project_lines]

    return data
151
+
152
def process_resume(file):
    """Gradio callback: extract structured data from an uploaded resume.

    Args:
        file: The upload as delivered by Gradio. With
            ``gr.File(type="filepath")`` this is a plain path string; objects
            exposing the path through a ``name`` attribute (temp-file style
            wrappers) are accepted as well. ``None`` means nothing was
            uploaded.

    Returns:
        A ``(status_message, payload)`` tuple. On success ``payload`` is the
        extracted data serialised as an indented JSON string; in every other
        case it is an empty dict.
    """
    if file is None:
        return "Please upload a resume file.", {}

    # A "filepath" upload is a str, which has no ``.name``; only fall back to
    # the attribute for file-like wrappers.
    if isinstance(file, (str, os.PathLike)):
        file_path = os.fspath(file)
    else:
        file_path = file.name
    _, file_extension = os.path.splitext(file_path)
    suffix = file_extension.lower()

    if suffix == ".pdf":
        text = extract_text_from_pdf(file_path)
    elif suffix in (".png", ".jpg", ".jpeg", ".tiff"):
        text = extract_text_from_image(file_path)
    else:
        # Report the extension exactly as uploaded (original case).
        return f"Unsupported file format: {file_extension}. Please upload a PDF or image file.", {}

    if text:
        extracted_data = extract_accurate_information(text)
        if extracted_data:
            return "Resume processed successfully!", json.dumps(extracted_data, indent=4)
    return "Failed to extract information from the resume. Please check the file format and content.", {}
173
+
174
# --- Gradio Interface ---
# Wire process_resume to a single file-upload input and to two outputs:
# a status message and the extracted data.
iface = gr.Interface(
    title="Resume Parser",
    description="Upload a resume (PDF or image) to extract key information.",
    fn=process_resume,
    inputs=gr.File(label="Upload Resume (PDF or Image)", type="filepath"),
    outputs=[gr.Textbox(label="Status"), gr.Json(label="Extracted Data")],
    # No bundled examples yet; list sample files here if you have them,
    # for example: "./examples/sample_resume.pdf"
    examples=[],
    allow_flagging="never",
)

# Launch the app only when this file is executed as a script.
if __name__ == "__main__":
    iface.launch()