husseinelsaadi committed on
Commit
471f933
·
1 Parent(s): b8deff5
Files changed (1) hide show
  1. backend/services/resume_parser.py +25 -27
backend/services/resume_parser.py CHANGED
@@ -1,4 +1,4 @@
1
- import json
2
  from pathlib import Path
3
  from typing import Dict
4
 
@@ -12,6 +12,11 @@ tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
12
  model = AutoModelForTokenClassification.from_pretrained(MODEL_NAME)
13
  ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")
14
 
 
 
 
 
 
15
  def extract_text(file_path: str) -> str:
16
  path = Path(file_path)
17
  if path.suffix.lower() == ".pdf":
@@ -21,41 +26,34 @@ def extract_text(file_path: str) -> str:
21
  text = "\n".join([p.text for p in doc.paragraphs])
22
  else:
23
  raise ValueError("Unsupported file format")
24
-
25
- # Clean text
26
- text = text.replace("\n", " ").replace("\r", " ").strip()
27
- return text
28
 
29
  def parse_resume(file_path: str, filename: str = None) -> Dict[str, str]:
30
  text = extract_text(file_path)
31
  entities = ner_pipeline(text)
32
 
33
- # Debug: Print actual detected entities
34
- print("\n=== DEBUG: Entities Detected ===")
35
- for ent in entities:
36
- print(f"{ent['entity_group']} => {ent['word']}")
37
- print("==============================\n")
38
-
39
- name_parts, skills, education, experience = [], [], [], []
40
 
41
- for ent in entities:
42
- label = ent["entity_group"].upper()
43
- value = ent["word"].strip()
44
 
45
- if label in ["NAME", "PERSON"]:
46
- name_parts.append(value)
47
- elif label in ["SKILL", "SKILLS"]:
48
- skills.append(value)
49
- elif label in ["EDUCATION", "DEGREE", "QUALIFICATION"]:
50
- education.append(value)
51
- elif label in ["EXPERIENCE", "JOB", "ROLE", "POSITION"]:
52
- experience.append(value)
53
 
54
- full_name = " ".join(dict.fromkeys(name_parts)) or "Not Found"
 
 
 
 
 
 
 
55
 
56
  return {
57
  "name": full_name,
58
- "skills": ", ".join(dict.fromkeys(skills)) or "Not Found",
59
- "education": ", ".join(dict.fromkeys(education)) or "Not Found",
60
- "experience": ", ".join(dict.fromkeys(experience)) or "Not Found"
61
  }
 
1
+ import re
2
  from pathlib import Path
3
  from typing import Dict
4
 
 
12
  model = AutoModelForTokenClassification.from_pretrained(MODEL_NAME)
13
  ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")
14
 
15
# Keyword lists used for whole-word, case-insensitive matching against the
# resume text. Entries are lower-case; extend the lists as needed.
SKILL_KEYWORDS = [
    "python",
    "java",
    "sql",
    "docker",
    "aws",
    "machine learning",
    "flask",
    "django",
    "react",
]
EDU_KEYWORDS = [
    "bachelor",
    "master",
    "phd",
    "bsc",
    "msc",
    "mba",
    "computer science",
    "engineering",
]
JOB_KEYWORDS = [
    "engineer",
    "developer",
    "manager",
    "analyst",
    "consultant",
    "specialist",
]
19
+
20
  def extract_text(file_path: str) -> str:
21
  path = Path(file_path)
22
  if path.suffix.lower() == ".pdf":
 
26
  text = "\n".join([p.text for p in doc.paragraphs])
27
  else:
28
  raise ValueError("Unsupported file format")
29
+ return text.replace("\n", " ").replace("\r", " ").strip()
 
 
 
30
 
31
  def parse_resume(file_path: str, filename: str = None) -> Dict[str, str]:
32
  text = extract_text(file_path)
33
  entities = ner_pipeline(text)
34
 
35
+ # Model extraction (Name only works well)
36
+ name_parts = [ent["word"].strip() for ent in entities if ent["entity_group"].upper() in ["NAME", "PERSON"]]
37
+ full_name = " ".join(dict.fromkeys(name_parts)) or "Not Found"
 
 
 
 
38
 
39
+ # Skills fallback
40
+ skills_found = [skill for skill in SKILL_KEYWORDS if re.search(rf"\b{skill}\b", text, re.IGNORECASE)]
 
41
 
42
+ # Education fallback
43
+ education_found = [edu for edu in EDU_KEYWORDS if re.search(rf"\b{edu}\b", text, re.IGNORECASE)]
 
 
 
 
 
 
44
 
45
+ # Experience fallback
46
+ experience_found = []
47
+ for job in JOB_KEYWORDS:
48
+ if re.search(rf"\b{job}\b", text, re.IGNORECASE):
49
+ experience_found.append(job)
50
+ years_match = re.findall(r"(\d+)\s*(?:years|yrs)", text, re.IGNORECASE)
51
+ if years_match:
52
+ experience_found.append(f"{max(map(int, years_match))} years")
53
 
54
  return {
55
  "name": full_name,
56
+ "skills": ", ".join(set(skills_found)) or "Not Found",
57
+ "education": ", ".join(set(education_found)) or "Not Found",
58
+ "experience": ", ".join(set(experience_found)) or "Not Found"
59
  }