Spaces:
Paused
Paused
Commit
·
775c09c
1
Parent(s):
ff62567
updated
Browse files- backend/services/resume_parser.py +30 -277
backend/services/resume_parser.py
CHANGED
|
@@ -1,95 +1,21 @@
|
|
| 1 |
-
"""
|
| 2 |
-
resume_parser.py
|
| 3 |
-
=================
|
| 4 |
-
|
| 5 |
-
This module provides lightweight functions to extract useful information
|
| 6 |
-
from a candidate's resume. The design avoids heavy dependencies such
|
| 7 |
-
as spaCy or pdfminer because Hugging Face Spaces environments are
|
| 8 |
-
resource‑constrained and installing additional packages at runtime is
|
| 9 |
-
often not feasible. Instead, built‑in Python libraries and a
|
| 10 |
-
few simple heuristics are used to extract text from both PDF and DOCX
|
| 11 |
-
files and to infer the candidate's name, skills, education and
|
| 12 |
-
experience from that text.
|
| 13 |
-
|
| 14 |
-
The parser operates on the assumption that most resumes follow a
|
| 15 |
-
relatively consistent structure: the candidate's name appears near the
|
| 16 |
-
top of the document, headings such as "Education" and "Experience"
|
| 17 |
-
demarcate sections, and common skill keywords are scattered
|
| 18 |
-
throughout. These assumptions will not hold for every CV, but they
|
| 19 |
-
provide a reasonable baseline for auto‑filling form fields. Users can
|
| 20 |
-
always edit the populated fields before submitting their application.
|
| 21 |
-
|
| 22 |
-
Functions
|
| 23 |
-
---------
|
| 24 |
-
|
| 25 |
-
* ``extract_text(file_path: str) -> str``
|
| 26 |
-
Read a resume file (PDF or DOCX) and return its plain text. PDFs
|
| 27 |
-
are processed using the ``pdftotext`` command line tool, which is
|
| 28 |
-
available in the Hugging Face Spaces container. DOCX files are
|
| 29 |
-
treated as zip archives; the ``word/document.xml`` component is
|
| 30 |
-
parsed and stripped of XML tags.
|
| 31 |
-
|
| 32 |
-
* ``extract_name(text: str, filename: str) -> str``
|
| 33 |
-
Attempt to infer the candidate's full name from the document text.
|
| 34 |
-
If no plausible name is found in the first few lines of the text,
|
| 35 |
-
fall back to deriving a name from the file name itself.
|
| 36 |
-
|
| 37 |
-
* ``extract_skills(text: str) -> list[str]``
|
| 38 |
-
Search for a predefined list of common technical and soft skills
|
| 39 |
-
within the resume text. Matches are case‑insensitive and unique
|
| 40 |
-
values are returned in their original capitalisation.
|
| 41 |
-
|
| 42 |
-
* ``extract_education(text: str) -> list[str]``
|
| 43 |
-
Identify lines mentioning educational qualifications. Heuristics
|
| 44 |
-
include the presence of keywords like "University", "Bachelor",
|
| 45 |
-
"Master", "PhD", etc.
|
| 46 |
-
|
| 47 |
-
* ``extract_experience(text: str) -> list[str]``
|
| 48 |
-
Extract statements describing work experience. Lines containing
|
| 49 |
-
keywords such as "experience", "Developer", "Engineer" or those
|
| 50 |
-
matching patterns with years of service are considered.
|
| 51 |
-
|
| 52 |
-
* ``parse_resume(file_path: str, filename: str) -> dict``
|
| 53 |
-
High‑level wrapper that orchestrates the text extraction and
|
| 54 |
-
information extraction functions. Returns a dictionary with keys
|
| 55 |
-
``name``, ``skills``, ``education``, and ``experience``.
|
| 56 |
-
|
| 57 |
-
The main Flask route can import ``parse_resume`` from this module and
|
| 58 |
-
return its result as JSON. Because the heuristics are conservative and
|
| 59 |
-
string‑based, the parser runs quickly on both CPU and GPU hosts.
|
| 60 |
-
"""
|
| 61 |
-
|
| 62 |
from __future__ import annotations
|
| 63 |
-
|
| 64 |
import os
|
| 65 |
import re
|
| 66 |
import subprocess
|
| 67 |
import zipfile
|
| 68 |
from typing import List
|
|
|
|
| 69 |
|
|
|
|
|
|
|
| 70 |
|
| 71 |
def extract_text(file_path: str) -> str:
|
| 72 |
-
"""Extract
|
| 73 |
-
|
| 74 |
-
Parameters
|
| 75 |
-
----------
|
| 76 |
-
file_path : str
|
| 77 |
-
Absolute path to the uploaded resume.
|
| 78 |
-
|
| 79 |
-
Returns
|
| 80 |
-
-------
|
| 81 |
-
str
|
| 82 |
-
The textual content of the resume. If extraction fails,
|
| 83 |
-
returns an empty string.
|
| 84 |
-
"""
|
| 85 |
if not file_path or not os.path.isfile(file_path):
|
| 86 |
return ""
|
| 87 |
|
| 88 |
lower_name = file_path.lower()
|
| 89 |
try:
|
| 90 |
-
# If the file ends with .pdf use pdftotext. The '-layout'
|
| 91 |
-
# flag preserves relative positioning which helps preserve
|
| 92 |
-
# line breaks in the output. Output is sent to stdout.
|
| 93 |
if lower_name.endswith('.pdf'):
|
| 94 |
try:
|
| 95 |
result = subprocess.run(
|
|
@@ -98,244 +24,71 @@ def extract_text(file_path: str) -> str:
|
|
| 98 |
stderr=subprocess.PIPE,
|
| 99 |
check=False
|
| 100 |
)
|
| 101 |
-
|
| 102 |
-
# Normalize whitespace and ensure section keywords are on separate lines
|
| 103 |
-
raw_text = re.sub(r'(?i)(education)', r'\n\1\n', raw_text)
|
| 104 |
-
raw_text = re.sub(r'(?i)(experience)', r'\n\1\n', raw_text)
|
| 105 |
-
raw_text = re.sub(r'(?i)(skills?)', r'\n\1\n', raw_text)
|
| 106 |
-
# Replace multiple spaces/tabs but keep newlines
|
| 107 |
-
raw_text = re.sub(r'[ \t]+', ' ', raw_text)
|
| 108 |
-
# Ensure section keywords are isolated
|
| 109 |
-
raw_text = re.sub(r'(?i)(education)', r'\n\1\n', raw_text)
|
| 110 |
-
raw_text = re.sub(r'(?i)(experience)', r'\n\1\n', raw_text)
|
| 111 |
-
raw_text = re.sub(r'(?i)(skills?)', r'\n\1\n', raw_text)
|
| 112 |
-
return raw_text
|
| 113 |
-
|
| 114 |
-
|
| 115 |
except Exception:
|
| 116 |
return ""
|
| 117 |
-
# If it's a .docx treat it as a zip archive and pull the main
|
| 118 |
-
# document XML. Note that .doc files are not supported since
|
| 119 |
-
# they use a binary format.
|
| 120 |
elif lower_name.endswith('.docx'):
|
| 121 |
try:
|
| 122 |
with zipfile.ZipFile(file_path) as zf:
|
| 123 |
with zf.open('word/document.xml') as docx_xml:
|
| 124 |
xml_bytes = docx_xml.read()
|
| 125 |
-
# Remove XML tags to leave plain text. Replace
|
| 126 |
-
# tags with spaces to avoid accidental word
|
| 127 |
-
# concatenation.
|
| 128 |
xml_text = xml_bytes.decode('utf-8', errors='ignore')
|
| 129 |
-
# Replace common markup elements with newlines to
|
| 130 |
-
# preserve paragraph structure. Some tags like
|
| 131 |
-
# ``<w:p>`` represent paragraphs in Word.
|
| 132 |
xml_text = re.sub(r'<w:p[^>]*>', '\n', xml_text, flags=re.I)
|
| 133 |
-
# Remove remaining tags
|
| 134 |
text = re.sub(r'<[^>]+>', ' ', xml_text)
|
| 135 |
-
# Collapse multiple whitespace
|
| 136 |
text = re.sub(r'\s+', ' ', text)
|
| 137 |
return text
|
| 138 |
except Exception:
|
| 139 |
return ""
|
| 140 |
else:
|
| 141 |
-
# Unsupported file type
|
| 142 |
return ""
|
| 143 |
except Exception:
|
| 144 |
return ""
|
| 145 |
|
| 146 |
-
|
| 147 |
def extract_name(text: str, filename: str) -> str:
|
| 148 |
-
"""
|
| 149 |
-
|
| 150 |
-
This function first inspects the first few lines of the resume
|
| 151 |
-
text. It looks for lines containing between two and four words
|
| 152 |
-
where each word starts with an uppercase letter. If such a line
|
| 153 |
-
isn't found, it falls back to deriving a name from the file name.
|
| 154 |
-
|
| 155 |
-
Parameters
|
| 156 |
-
----------
|
| 157 |
-
text : str
|
| 158 |
-
The full resume text.
|
| 159 |
-
filename : str
|
| 160 |
-
The original filename of the uploaded resume.
|
| 161 |
-
|
| 162 |
-
Returns
|
| 163 |
-
-------
|
| 164 |
-
str
|
| 165 |
-
Inferred full name or an empty string if not found.
|
| 166 |
-
"""
|
| 167 |
if text:
|
| 168 |
-
# Consider the first 10 lines for a potential name. Strip
|
| 169 |
-
# whitespace and ignore empty lines.
|
| 170 |
lines = [ln.strip() for ln in text.splitlines() if ln.strip()]
|
| 171 |
for line in lines[:10]:
|
| 172 |
-
# Remove common headings like "Resume" or "Curriculum Vitae"
|
| 173 |
if re.match(r'(?i)resume|curriculum vitae', line):
|
| 174 |
continue
|
| 175 |
words = line.split()
|
| 176 |
-
# A plausible name typically has 2–4 words
|
| 177 |
if 1 < len(words) <= 4:
|
| 178 |
-
# All words must start with an uppercase letter (allow
|
| 179 |
-
# accented characters) and contain at least one letter.
|
| 180 |
if all(re.match(r'^[A-ZÀ-ÖØ-Þ][\w\-]*', w) for w in words):
|
| 181 |
return line
|
| 182 |
-
# Fallback: derive a name from the filename
|
| 183 |
base = os.path.basename(filename)
|
| 184 |
-
# Remove extension
|
| 185 |
base = re.sub(r'\.(pdf|docx|doc)$', '', base, flags=re.I)
|
| 186 |
-
# Replace underscores, dashes and dots with spaces
|
| 187 |
base = re.sub(r'[\._-]+', ' ', base)
|
| 188 |
-
# Remove common tokens like 'cv' or 'resume'
|
| 189 |
base = re.sub(r'(?i)\b(cv|resume)\b', '', base)
|
| 190 |
base = re.sub(r'\s+', ' ', base).strip()
|
| 191 |
-
# Title case the remaining string
|
| 192 |
return base.title() if base else ''
|
| 193 |
|
| 194 |
-
|
| 195 |
-
|
| 196 |
-
|
| 197 |
-
|
| 198 |
-
|
| 199 |
-
|
| 200 |
-
|
| 201 |
-
|
| 202 |
-
|
| 203 |
-
|
| 204 |
-
|
| 205 |
-
|
| 206 |
-
|
| 207 |
-
|
| 208 |
-
|
| 209 |
-
|
| 210 |
-
|
| 211 |
-
|
| 212 |
-
capitalisation where possible.
|
| 213 |
-
"""
|
| 214 |
-
if not text:
|
| 215 |
-
return []
|
| 216 |
-
lower_text = text.lower()
|
| 217 |
-
# Define a set of common technical and soft skills. This list can
|
| 218 |
-
# be extended in future iterations without modifying the parser
|
| 219 |
-
SKILLS = [
|
| 220 |
-
'python', 'java', 'c++', 'c', 'javascript', 'html', 'css',
|
| 221 |
-
'react', 'node', 'angular', 'vue', 'django', 'flask', 'spring',
|
| 222 |
-
'machine learning', 'deep learning', 'nlp', 'data analysis',
|
| 223 |
-
'data science', 'sql', 'mysql', 'postgresql', 'mongodb', 'git',
|
| 224 |
-
'docker', 'kubernetes', 'aws', 'azure', 'gcp', 'linux',
|
| 225 |
-
'tensorflow', 'pytorch', 'scikit-learn', 'pandas', 'numpy',
|
| 226 |
-
'matplotlib', 'excel', 'powerpoint', 'project management',
|
| 227 |
-
'communication', 'teamwork', 'leadership', 'problem solving',
|
| 228 |
-
'public speaking', 'writing', 'analysis', 'time management'
|
| 229 |
-
]
|
| 230 |
-
found = []
|
| 231 |
-
for skill in SKILLS:
|
| 232 |
-
pattern = re.escape(skill.lower())
|
| 233 |
-
if re.search(r'\b' + pattern + r'(\b|[^a-zA-Z])', lower_text):
|
| 234 |
-
# Preserve the original capitalisation of the skill phrase
|
| 235 |
-
found.append(skill.title() if skill.islower() else skill)
|
| 236 |
-
return list(dict.fromkeys(found)) # Remove duplicates, preserve order
|
| 237 |
-
|
| 238 |
-
|
| 239 |
-
def extract_education(text: str) -> List[str]:
|
| 240 |
-
"""Gather educational qualifications from the resume text.
|
| 241 |
-
|
| 242 |
-
The function searches for lines containing keywords related to
|
| 243 |
-
education. Only distinct lines with meaningful content are
|
| 244 |
-
included.
|
| 245 |
-
|
| 246 |
-
Parameters
|
| 247 |
-
----------
|
| 248 |
-
text : str
|
| 249 |
-
|
| 250 |
-
Returns
|
| 251 |
-
-------
|
| 252 |
-
list[str]
|
| 253 |
-
Lines representing educational qualifications.
|
| 254 |
-
"""
|
| 255 |
-
if not text:
|
| 256 |
-
return []
|
| 257 |
-
lines = [ln.strip() for ln in text.splitlines() if ln.strip()]
|
| 258 |
-
education_keywords = [
|
| 259 |
-
'university', 'college', 'bachelor', 'bachelors', 'master', 'masters',
|
| 260 |
-
'phd', 'b.sc', 'bsc', 'm.sc', 'msc', 'mba', 'school', 'degree',
|
| 261 |
-
'diploma', 'engineering', 'work history'
|
| 262 |
-
]
|
| 263 |
-
|
| 264 |
-
results = []
|
| 265 |
-
for line in lines:
|
| 266 |
-
lower = line.lower()
|
| 267 |
-
if any(kw in lower for kw in education_keywords):
|
| 268 |
-
# Avoid capturing the same line twice
|
| 269 |
-
if line not in results:
|
| 270 |
-
results.append(line)
|
| 271 |
-
# If nothing found, return an empty list
|
| 272 |
-
return results
|
| 273 |
-
|
| 274 |
-
|
| 275 |
-
def extract_experience(text: str) -> List[str]:
|
| 276 |
-
"""Extract snippets of work experience from resume text.
|
| 277 |
-
|
| 278 |
-
Heuristics are used to detect sentences or lines that likely
|
| 279 |
-
describe professional experience. Indicators include the presence
|
| 280 |
-
of keywords like "experience", job titles, or explicit durations.
|
| 281 |
-
|
| 282 |
-
Parameters
|
| 283 |
-
----------
|
| 284 |
-
text : str
|
| 285 |
-
|
| 286 |
-
Returns
|
| 287 |
-
-------
|
| 288 |
-
list[str]
|
| 289 |
-
A list of lines summarising work experience.
|
| 290 |
-
"""
|
| 291 |
-
if not text:
|
| 292 |
-
return []
|
| 293 |
-
lines = [ln.strip() for ln in text.splitlines() if ln.strip()]
|
| 294 |
-
# Keywords signalling experience entries
|
| 295 |
-
exp_keywords = [
|
| 296 |
-
'experience', 'worked', 'employment', 'internship', 'developer',
|
| 297 |
-
'engineer', 'manager', 'analyst', 'consultant', 'assistant',
|
| 298 |
-
'years', 'year', 'months', 'month', 'present'
|
| 299 |
-
]
|
| 300 |
-
results = []
|
| 301 |
-
for line in lines:
|
| 302 |
-
lower = line.lower()
|
| 303 |
-
if any(kw in lower for kw in exp_keywords):
|
| 304 |
-
# Filter out lines that are just section headings
|
| 305 |
-
if len(lower.split()) > 2:
|
| 306 |
-
if line not in results:
|
| 307 |
-
results.append(line)
|
| 308 |
-
return results
|
| 309 |
-
|
| 310 |
|
| 311 |
def parse_resume(file_path: str, filename: str) -> dict:
|
| 312 |
-
"""
|
| 313 |
-
|
| 314 |
-
Parameters
|
| 315 |
-
----------
|
| 316 |
-
file_path : str
|
| 317 |
-
Location of the uploaded file on disk.
|
| 318 |
-
filename : str
|
| 319 |
-
The original filename as provided by the user. Used as a
|
| 320 |
-
fallback for name extraction if the document text does not
|
| 321 |
-
reveal a plausible name.
|
| 322 |
-
|
| 323 |
-
Returns
|
| 324 |
-
-------
|
| 325 |
-
dict
|
| 326 |
-
Dictionary with keys ``name``, ``skills``, ``education`` and
|
| 327 |
-
``experience``. Each value is a string, except for the name
|
| 328 |
-
which is a single string. Lists are joined into a comma or
|
| 329 |
-
newline separated string suitable for form fields.
|
| 330 |
-
"""
|
| 331 |
text = extract_text(file_path)
|
| 332 |
name = extract_name(text, filename)
|
| 333 |
-
|
| 334 |
-
education_list = extract_education(text)
|
| 335 |
-
experience_list = extract_experience(text)
|
| 336 |
return {
|
| 337 |
'name': name or '',
|
| 338 |
-
'skills': ', '.join(
|
| 339 |
-
'education': '
|
| 340 |
-
'experience': '
|
| 341 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
from __future__ import annotations
|
|
|
|
| 2 |
import os
|
| 3 |
import re
|
| 4 |
import subprocess
|
| 5 |
import zipfile
|
| 6 |
from typing import List
|
| 7 |
+
from transformers import pipeline
|
| 8 |
|
| 9 |
+
# Load the NER model for resume parsing
|
| 10 |
+
ner = pipeline("ner", model="AI-Sweden-Models/distilbert-resume-ner", aggregation_strategy="simple")
|
| 11 |
|
| 12 |
def extract_text(file_path: str) -> str:
|
| 13 |
+
"""Extract text from PDF or DOCX."""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
if not file_path or not os.path.isfile(file_path):
|
| 15 |
return ""
|
| 16 |
|
| 17 |
lower_name = file_path.lower()
|
| 18 |
try:
|
|
|
|
|
|
|
|
|
|
| 19 |
if lower_name.endswith('.pdf'):
|
| 20 |
try:
|
| 21 |
result = subprocess.run(
|
|
|
|
| 24 |
stderr=subprocess.PIPE,
|
| 25 |
check=False
|
| 26 |
)
|
| 27 |
+
return result.stdout.decode('utf-8', errors='ignore')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 28 |
except Exception:
|
| 29 |
return ""
|
|
|
|
|
|
|
|
|
|
| 30 |
elif lower_name.endswith('.docx'):
|
| 31 |
try:
|
| 32 |
with zipfile.ZipFile(file_path) as zf:
|
| 33 |
with zf.open('word/document.xml') as docx_xml:
|
| 34 |
xml_bytes = docx_xml.read()
|
|
|
|
|
|
|
|
|
|
| 35 |
xml_text = xml_bytes.decode('utf-8', errors='ignore')
|
|
|
|
|
|
|
|
|
|
| 36 |
xml_text = re.sub(r'<w:p[^>]*>', '\n', xml_text, flags=re.I)
|
|
|
|
| 37 |
text = re.sub(r'<[^>]+>', ' ', xml_text)
|
|
|
|
| 38 |
text = re.sub(r'\s+', ' ', text)
|
| 39 |
return text
|
| 40 |
except Exception:
|
| 41 |
return ""
|
| 42 |
else:
|
|
|
|
| 43 |
return ""
|
| 44 |
except Exception:
|
| 45 |
return ""
|
| 46 |
|
|
|
|
| 47 |
def extract_name(text: str, filename: str) -> str:
    """Infer the candidate's full name.

    Scans the first ten non-empty lines of *text* for a short line
    (2–4 words) in which every word starts with an uppercase letter.
    If no such line exists, derives a title-cased name from *filename*
    instead. Returns '' when nothing plausible can be found.
    """
    heading_re = re.compile(r'(?i)resume|curriculum vitae')
    cap_word_re = re.compile(r'^[A-ZÀ-ÖØ-Þ][\w\-]*')

    candidates = []
    if text:
        candidates = [stripped for stripped in
                      (raw.strip() for raw in text.splitlines()) if stripped]
    for candidate in candidates[:10]:
        # Skip generic document headings such as "Resume" / "Curriculum Vitae".
        if heading_re.match(candidate):
            continue
        tokens = candidate.split()
        if 1 < len(tokens) <= 4 and all(cap_word_re.match(t) for t in tokens):
            return candidate

    # Fallback: build a name from the file name — drop the extension,
    # turn separator characters into spaces, remove cv/resume tokens,
    # collapse whitespace and title-case what remains.
    stem = os.path.basename(filename)
    stem = re.sub(r'\.(pdf|docx|doc)$', '', stem, flags=re.I)
    stem = re.sub(r'[\._-]+', ' ', stem)
    stem = re.sub(r'(?i)\b(cv|resume)\b', '', stem)
    stem = re.sub(r'\s+', ' ', stem).strip()
    return stem.title() if stem else ''
|
| 64 |
|
| 65 |
+
def extract_entities(text: str) -> dict:
    """Extract structured resume fields using the NER pipeline.

    Parameters
    ----------
    text : str
        Plain resume text. May be empty — ``extract_text`` returns ""
        on every failure path.

    Returns
    -------
    dict
        Keys ``skills``, ``education`` and ``experience``, each mapped
        to a de-duplicated list of entity strings in order of first
        appearance.
    """
    # Guard: don't invoke the model on empty input; just report empty
    # fields so callers get a consistent shape even when text
    # extraction failed upstream.
    if not text:
        return {"skills": [], "education": [], "experience": []}

    skills, education, experience = [], [], []
    # NOTE(review): label names below assume the model's tag set includes
    # SKILL/EDUCATION/EXPERIENCE-style groups — confirm against the
    # model card for AI-Sweden-Models/distilbert-resume-ner.
    for ent in ner(text):
        label = ent['entity_group'].upper()
        word = ent['word'].strip()
        if label in ("SKILL", "TECH", "TECHNOLOGY"):
            skills.append(word)
        elif label in ("EDUCATION", "DEGREE", "QUALIFICATION"):
            education.append(word)
        elif label in ("EXPERIENCE", "JOB", "ROLE"):
            experience.append(word)
    # dict.fromkeys removes duplicates while preserving insertion order.
    return {
        "skills": list(dict.fromkeys(skills)),
        "education": list(dict.fromkeys(education)),
        "experience": list(dict.fromkeys(experience)),
    }
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 83 |
|
| 84 |
def parse_resume(file_path: str, filename: str) -> dict:
    """Parse an uploaded resume into form-ready string fields.

    Parameters
    ----------
    file_path : str
        Location of the uploaded file on disk.
    filename : str
        Original upload name; used as a fallback for name inference
        when the document text yields no plausible name.

    Returns
    -------
    dict
        Keys ``name``, ``skills``, ``education`` and ``experience``,
        each a string. List-valued fields are joined with ", " and are
        "" when nothing was found.
    """
    text = extract_text(file_path)
    name = extract_name(text, filename)
    ents = extract_entities(text)
    # str.join of an empty list is already '', so no conditional is needed.
    return {
        'name': name or '',
        'skills': ', '.join(ents["skills"]),
        'education': ', '.join(ents["education"]),
        'experience': ', '.join(ents["experience"]),
    }
|