Spaces:
Paused
Paused
Commit
·
ff62567
1
Parent(s):
af02e64
parse resume added
Browse files
backend/services/resume_parser.py
CHANGED
|
@@ -98,7 +98,20 @@ def extract_text(file_path: str) -> str:
|
|
| 98 |
stderr=subprocess.PIPE,
|
| 99 |
check=False
|
| 100 |
)
|
| 101 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 102 |
except Exception:
|
| 103 |
return ""
|
| 104 |
# If it's a .docx treat it as a zip archive and pull the main
|
|
@@ -217,7 +230,7 @@ def extract_skills(text: str) -> List[str]:
|
|
| 217 |
found = []
|
| 218 |
for skill in SKILLS:
|
| 219 |
pattern = re.escape(skill.lower())
|
| 220 |
-
if re.search(r'\b' + pattern + r'\b', lower_text):
|
| 221 |
# Preserve the original capitalisation of the skill phrase
|
| 222 |
found.append(skill.title() if skill.islower() else skill)
|
| 223 |
return list(dict.fromkeys(found)) # Remove duplicates, preserve order
|
|
@@ -243,9 +256,11 @@ def extract_education(text: str) -> List[str]:
|
|
| 243 |
return []
|
| 244 |
lines = [ln.strip() for ln in text.splitlines() if ln.strip()]
|
| 245 |
education_keywords = [
|
| 246 |
-
'university', 'college', 'bachelor', '
|
| 247 |
-
'
|
|
|
|
| 248 |
]
|
|
|
|
| 249 |
results = []
|
| 250 |
for line in lines:
|
| 251 |
lower = line.lower()
|
|
|
|
| 98 |
stderr=subprocess.PIPE,
|
| 99 |
check=False
|
| 100 |
)
|
| 101 |
+
raw_text = result.stdout.decode('utf-8', errors='ignore')
|
| 102 |
+
# Normalize whitespace and ensure section keywords are on separate lines
|
| 103 |
+
raw_text = re.sub(r'(?i)(education)', r'\n\1\n', raw_text)
|
| 104 |
+
raw_text = re.sub(r'(?i)(experience)', r'\n\1\n', raw_text)
|
| 105 |
+
raw_text = re.sub(r'(?i)(skills?)', r'\n\1\n', raw_text)
|
| 106 |
+
# Replace multiple spaces/tabs but keep newlines
|
| 107 |
+
raw_text = re.sub(r'[ \t]+', ' ', raw_text)
|
| 108 |
+
# Ensure section keywords are isolated
|
| 109 |
+
raw_text = re.sub(r'(?i)(education)', r'\n\1\n', raw_text)
|
| 110 |
+
raw_text = re.sub(r'(?i)(experience)', r'\n\1\n', raw_text)
|
| 111 |
+
raw_text = re.sub(r'(?i)(skills?)', r'\n\1\n', raw_text)
|
| 112 |
+
return raw_text
|
| 113 |
+
|
| 114 |
+
|
| 115 |
except Exception:
|
| 116 |
return ""
|
| 117 |
# If it's a .docx treat it as a zip archive and pull the main
|
|
|
|
| 230 |
found = []
|
| 231 |
for skill in SKILLS:
|
| 232 |
pattern = re.escape(skill.lower())
|
| 233 |
+
if re.search(r'\b' + pattern + r'(\b|[^a-zA-Z])', lower_text):
|
| 234 |
# Preserve the original capitalisation of the skill phrase
|
| 235 |
found.append(skill.title() if skill.islower() else skill)
|
| 236 |
return list(dict.fromkeys(found)) # Remove duplicates, preserve order
|
|
|
|
| 256 |
return []
|
| 257 |
lines = [ln.strip() for ln in text.splitlines() if ln.strip()]
|
| 258 |
education_keywords = [
|
| 259 |
+
'university', 'college', 'bachelor', 'bachelors', 'master', 'masters',
|
| 260 |
+
'phd', 'b.sc', 'bsc', 'm.sc', 'msc', 'mba', 'school', 'degree',
|
| 261 |
+
'diploma', 'engineering', 'work history'
|
| 262 |
]
|
| 263 |
+
|
| 264 |
results = []
|
| 265 |
for line in lines:
|
| 266 |
lower = line.lower()
|