Spaces:
Paused
Paused
Commit
·
471f933
1
Parent(s):
b8deff5
updated
Browse files
backend/services/resume_parser.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
import
|
| 2 |
from pathlib import Path
|
| 3 |
from typing import Dict
|
| 4 |
|
|
@@ -12,6 +12,11 @@ tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
|
|
| 12 |
model = AutoModelForTokenClassification.from_pretrained(MODEL_NAME)
|
| 13 |
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")
|
| 14 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
def extract_text(file_path: str) -> str:
|
| 16 |
path = Path(file_path)
|
| 17 |
if path.suffix.lower() == ".pdf":
|
|
@@ -21,41 +26,34 @@ def extract_text(file_path: str) -> str:
|
|
| 21 |
text = "\n".join([p.text for p in doc.paragraphs])
|
| 22 |
else:
|
| 23 |
raise ValueError("Unsupported file format")
|
| 24 |
-
|
| 25 |
-
# Clean text
|
| 26 |
-
text = text.replace("\n", " ").replace("\r", " ").strip()
|
| 27 |
-
return text
|
| 28 |
|
| 29 |
def parse_resume(file_path: str, filename: str = None) -> Dict[str, str]:
|
| 30 |
text = extract_text(file_path)
|
| 31 |
entities = ner_pipeline(text)
|
| 32 |
|
| 33 |
-
#
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
print(f"{ent['entity_group']} => {ent['word']}")
|
| 37 |
-
print("==============================\n")
|
| 38 |
-
|
| 39 |
-
name_parts, skills, education, experience = [], [], [], []
|
| 40 |
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
value = ent["word"].strip()
|
| 44 |
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
elif label in ["SKILL", "SKILLS"]:
|
| 48 |
-
skills.append(value)
|
| 49 |
-
elif label in ["EDUCATION", "DEGREE", "QUALIFICATION"]:
|
| 50 |
-
education.append(value)
|
| 51 |
-
elif label in ["EXPERIENCE", "JOB", "ROLE", "POSITION"]:
|
| 52 |
-
experience.append(value)
|
| 53 |
|
| 54 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 55 |
|
| 56 |
return {
|
| 57 |
"name": full_name,
|
| 58 |
-
"skills": ", ".join(
|
| 59 |
-
"education": ", ".join(
|
| 60 |
-
"experience": ", ".join(
|
| 61 |
}
|
|
|
|
| 1 |
+
import re
|
| 2 |
from pathlib import Path
|
| 3 |
from typing import Dict
|
| 4 |
|
|
|
|
| 12 |
model = AutoModelForTokenClassification.from_pretrained(MODEL_NAME)
|
| 13 |
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")
|
| 14 |
|
| 15 |
+
# Keyword lists for the regex-based fallback extraction in parse_resume (matched as whole words, case-insensitively; extend as needed)
|
| 16 |
+
SKILL_KEYWORDS = ["python", "java", "sql", "docker", "aws", "machine learning", "flask", "django", "react"]
|
| 17 |
+
EDU_KEYWORDS = ["bachelor", "master", "phd", "bsc", "msc", "mba", "computer science", "engineering"]
|
| 18 |
+
JOB_KEYWORDS = ["engineer", "developer", "manager", "analyst", "consultant", "specialist"]
|
| 19 |
+
|
| 20 |
def extract_text(file_path: str) -> str:
|
| 21 |
path = Path(file_path)
|
| 22 |
if path.suffix.lower() == ".pdf":
|
|
|
|
| 26 |
text = "\n".join([p.text for p in doc.paragraphs])
|
| 27 |
else:
|
| 28 |
raise ValueError("Unsupported file format")
|
| 29 |
+
return text.replace("\n", " ").replace("\r", " ").strip()
|
|
|
|
|
|
|
|
|
|
| 30 |
|
| 31 |
def parse_resume(file_path: str, filename: str = None) -> Dict[str, str]:
    """Extract name, skills, education and experience from a resume file.

    The name is taken from the NER pipeline output. Skills, education and
    experience are found by case-insensitive, whole-word keyword matching
    against the extracted text, because only the name entities are used
    from the model.

    Args:
        file_path: Path of the resume file; passed to ``extract_text``.
        filename: Not used by this function. Kept so existing callers that
            pass it keep working.

    Returns:
        A dict with the keys ``"name"``, ``"skills"``, ``"education"`` and
        ``"experience"``. Each value is a single string (comma-separated
        for the keyword fields), or ``"Not Found"`` when nothing was
        detected. Keyword order follows the order of the keyword lists, so
        the output is the same on every run.

    Raises:
        ValueError: Propagated from ``extract_text`` when the file format
            is not supported.
    """
    text = extract_text(file_path)

    # Skip the model call when no text could be extracted (e.g. an
    # image-only document); there is nothing for it to tag.
    entities = ner_pipeline(text) if text else []

    # Model extraction: only NAME / PERSON entities are used.
    name_parts = [
        ent["word"].strip()
        for ent in entities
        if ent["entity_group"].upper() in ("NAME", "PERSON")
    ]
    # dict.fromkeys drops repeated name tokens while keeping first-seen order.
    full_name = " ".join(dict.fromkeys(name_parts)) or "Not Found"

    def _keywords_in_text(keywords):
        # Return the keywords that occur in the text as whole words.
        # The lookarounds behave like \b for ordinary words but also work
        # for keywords that begin or end with punctuation (e.g. "c++"),
        # and re.escape keeps such keywords from being read as regex syntax.
        return [
            kw
            for kw in dict.fromkeys(keywords)
            if re.search(rf"(?<!\w){re.escape(kw)}(?!\w)", text, re.IGNORECASE)
        ]

    skills_found = _keywords_in_text(SKILL_KEYWORDS)
    education_found = _keywords_in_text(EDU_KEYWORDS)
    experience_found = _keywords_in_text(JOB_KEYWORDS)

    # Accept "5 years", "5+ years", "1 year", "3 yrs" and "2 yr"; the
    # trailing \b keeps "year" from matching inside longer words.
    years_match = re.findall(
        r"(\d+)\s*\+?\s*(?:years?|yrs?)\b", text, re.IGNORECASE
    )
    if years_match:
        longest = max(map(int, years_match))
        experience_found.append(
            f"{longest} year" if longest == 1 else f"{longest} years"
        )

    return {
        "name": full_name,
        "skills": ", ".join(skills_found) or "Not Found",
        "education": ", ".join(education_found) or "Not Found",
        "experience": ", ".join(experience_found) or "Not Found",
    }
|