Spaces:
Paused
Paused
Commit
·
b8deff5
1
Parent(s):
288175b
updated
Browse files
backend/services/resume_parser.py
CHANGED
|
@@ -6,49 +6,47 @@ from pdfminer.high_level import extract_text as pdf_extract_text
|
|
| 6 |
from docx import Document
|
| 7 |
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
|
| 8 |
|
| 9 |
-
|
| 10 |
-
# Load PyTorch Resume NER Model
|
| 11 |
-
# --------------------
|
| 12 |
-
MODEL_NAME = "manishiitg/resume-ner" # Works with PyTorch on Hugging Face Spaces
|
| 13 |
|
| 14 |
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
|
| 15 |
model = AutoModelForTokenClassification.from_pretrained(MODEL_NAME)
|
| 16 |
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")
|
| 17 |
|
| 18 |
-
# --------------------
|
| 19 |
-
# Extract Text from PDF/DOCX
|
| 20 |
-
# --------------------
|
| 21 |
def extract_text(file_path: str) -> str:
|
| 22 |
path = Path(file_path)
|
| 23 |
if path.suffix.lower() == ".pdf":
|
| 24 |
-
|
| 25 |
elif path.suffix.lower() == ".docx":
|
| 26 |
doc = Document(file_path)
|
| 27 |
-
|
| 28 |
else:
|
| 29 |
raise ValueError("Unsupported file format")
|
|
|
|
|
|
|
|
|
|
|
|
|
| 30 |
|
| 31 |
-
# --------------------
|
| 32 |
-
# Parse Resume (returns only: full name, skills, education, experience)
|
| 33 |
-
# --------------------
|
| 34 |
def parse_resume(file_path: str, filename: str = None) -> Dict[str, str]:
|
| 35 |
text = extract_text(file_path)
|
| 36 |
entities = ner_pipeline(text)
|
| 37 |
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
|
|
|
|
|
|
|
|
|
| 42 |
|
| 43 |
for ent in entities:
|
| 44 |
label = ent["entity_group"].upper()
|
| 45 |
value = ent["word"].strip()
|
| 46 |
|
| 47 |
-
if label
|
| 48 |
name_parts.append(value)
|
| 49 |
-
elif label
|
| 50 |
skills.append(value)
|
| 51 |
-
elif label in ["EDUCATION", "DEGREE"]:
|
| 52 |
education.append(value)
|
| 53 |
elif label in ["EXPERIENCE", "JOB", "ROLE", "POSITION"]:
|
| 54 |
experience.append(value)
|
|
|
|
# Third-party dependencies for DOCX parsing and the NER model.
# NOTE(review): `Path`, `Dict`, and pdfminer's `extract_text` (aliased
# `pdf_extract_text`) are imported earlier in the file, outside this view.
from docx import Document
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

# Hugging Face model id for resume entity extraction (PyTorch backend).
MODEL_NAME = "manishiitg/resume-ner"

# Model is loaded eagerly at import time — first import of this module is slow.
# aggregation_strategy="simple" merges sub-word tokens so each detected entity
# appears as a single {"entity_group", "word", ...} dict in pipeline output.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForTokenClassification.from_pretrained(MODEL_NAME)
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")
def extract_text(file_path: str) -> str:
    """Return the plain text of a resume file as one whitespace-flattened string.

    Supports PDF (via pdfminer) and DOCX (via python-docx).

    Raises:
        ValueError: if the file extension is neither ``.pdf`` nor ``.docx``.
    """
    suffix = Path(file_path).suffix.lower()
    if suffix == ".pdf":
        raw = pdf_extract_text(file_path)
    elif suffix == ".docx":
        document = Document(file_path)
        raw = "\n".join(paragraph.text for paragraph in document.paragraphs)
    else:
        raise ValueError("Unsupported file format")

    # Collapse line breaks to spaces so the NER pipeline receives a single
    # continuous string, then trim surrounding whitespace.
    return raw.replace("\n", " ").replace("\r", " ").strip()
| 29 |
def parse_resume(file_path: str, filename: str = None) -> Dict[str, str]:
|
| 30 |
text = extract_text(file_path)
|
| 31 |
entities = ner_pipeline(text)
|
| 32 |
|
| 33 |
+
# Debug: Print actual detected entities
|
| 34 |
+
print("\n=== DEBUG: Entities Detected ===")
|
| 35 |
+
for ent in entities:
|
| 36 |
+
print(f"{ent['entity_group']} => {ent['word']}")
|
| 37 |
+
print("==============================\n")
|
| 38 |
+
|
| 39 |
+
name_parts, skills, education, experience = [], [], [], []
|
| 40 |
|
| 41 |
for ent in entities:
|
| 42 |
label = ent["entity_group"].upper()
|
| 43 |
value = ent["word"].strip()
|
| 44 |
|
| 45 |
+
if label in ["NAME", "PERSON"]:
|
| 46 |
name_parts.append(value)
|
| 47 |
+
elif label in ["SKILL", "SKILLS"]:
|
| 48 |
skills.append(value)
|
| 49 |
+
elif label in ["EDUCATION", "DEGREE", "QUALIFICATION"]:
|
| 50 |
education.append(value)
|
| 51 |
elif label in ["EXPERIENCE", "JOB", "ROLE", "POSITION"]:
|
| 52 |
experience.append(value)
|