Spaces:

husseinelsaadi
/

Codingo

Paused

App Files Community

husseinelsaadi committed on Aug 4, 2025

Commit

471f933

1 Parent(s): b8deff5

updated

Browse files

Files changed (1) hide show

backend/services/resume_parser.py +25 -27

backend/services/resume_parser.py CHANGED Viewed

@@ -1,4 +1,4 @@
-import json
 from pathlib import Path
 from typing import Dict
@@ -12,6 +12,11 @@ tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
 model = AutoModelForTokenClassification.from_pretrained(MODEL_NAME)
 ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")
 def extract_text(file_path: str) -> str:
     path = Path(file_path)
     if path.suffix.lower() == ".pdf":
@@ -21,41 +26,34 @@ def extract_text(file_path: str) -> str:
         text = "\n".join([p.text for p in doc.paragraphs])
     else:
         raise ValueError("Unsupported file format")
-    # Clean text
-    text = text.replace("\n", " ").replace("\r", " ").strip()
-    return text
 def parse_resume(file_path: str, filename: str = None) -> Dict[str, str]:
     text = extract_text(file_path)
     entities = ner_pipeline(text)
-    # Debug: Print actual detected entities
-    print("\n=== DEBUG: Entities Detected ===")
-    for ent in entities:
-        print(f"{ent['entity_group']} => {ent['word']}")
-    print("==============================\n")
-    name_parts, skills, education, experience = [], [], [], []
-    for ent in entities:
-        label = ent["entity_group"].upper()
-        value = ent["word"].strip()
-        if label in ["NAME", "PERSON"]:
-            name_parts.append(value)
-        elif label in ["SKILL", "SKILLS"]:
-            skills.append(value)
-        elif label in ["EDUCATION", "DEGREE", "QUALIFICATION"]:
-            education.append(value)
-        elif label in ["EXPERIENCE", "JOB", "ROLE", "POSITION"]:
-            experience.append(value)
-    full_name = " ".join(dict.fromkeys(name_parts)) or "Not Found"
     return {
         "name": full_name,
-        "skills": ", ".join(dict.fromkeys(skills)) or "Not Found",
-        "education": ", ".join(dict.fromkeys(education)) or "Not Found",
-        "experience": ", ".join(dict.fromkeys(experience)) or "Not Found"
     }

+import re
 from pathlib import Path
 from typing import Dict
 model = AutoModelForTokenClassification.from_pretrained(MODEL_NAME)
 ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")
+# Basic keyword lists (you can expand dynamically if needed)
+SKILL_KEYWORDS = ["python", "java", "sql", "docker", "aws", "machine learning", "flask", "django", "react"]
+EDU_KEYWORDS = ["bachelor", "master", "phd", "bsc", "msc", "mba", "computer science", "engineering"]
+JOB_KEYWORDS = ["engineer", "developer", "manager", "analyst", "consultant", "specialist"]
 def extract_text(file_path: str) -> str:
     path = Path(file_path)
     if path.suffix.lower() == ".pdf":
         text = "\n".join([p.text for p in doc.paragraphs])
     else:
         raise ValueError("Unsupported file format")
+    return text.replace("\n", " ").replace("\r", " ").strip()
 def parse_resume(file_path: str, filename: str = None) -> Dict[str, str]:
     text = extract_text(file_path)
     entities = ner_pipeline(text)
+    # Model extraction (Name only works well)
+    name_parts = [ent["word"].strip() for ent in entities if ent["entity_group"].upper() in ["NAME", "PERSON"]]
+    full_name = " ".join(dict.fromkeys(name_parts)) or "Not Found"
+    # Skills fallback
+    skills_found = [skill for skill in SKILL_KEYWORDS if re.search(rf"\b{skill}\b", text, re.IGNORECASE)]
+    # Education fallback
+    education_found = [edu for edu in EDU_KEYWORDS if re.search(rf"\b{edu}\b", text, re.IGNORECASE)]
+    # Experience fallback
+    experience_found = []
+    for job in JOB_KEYWORDS:
+        if re.search(rf"\b{job}\b", text, re.IGNORECASE):
+            experience_found.append(job)
+    years_match = re.findall(r"(\d+)\s*(?:years|yrs)", text, re.IGNORECASE)
+    if years_match:
+        experience_found.append(f"{max(map(int, years_match))} years")
     return {
         "name": full_name,
+        "skills": ", ".join(set(skills_found)) or "Not Found",
+        "education": ", ".join(set(education_found)) or "Not Found",
+        "experience": ", ".join(set(experience_found)) or "Not Found"
     }