Spaces:
Sleeping
Sleeping
Update extractor.py
Browse files- extractor.py +10 -43
extractor.py
CHANGED
|
@@ -1,52 +1,19 @@
|
|
| 1 |
-
import spacy
|
| 2 |
-
import re
|
| 3 |
import pdfplumber
|
|
|
|
| 4 |
|
| 5 |
-
# Load the spaCy English model
|
| 6 |
nlp = spacy.load("en_core_web_sm")
|
| 7 |
|
| 8 |
def extract_text_from_pdf(file):
    """Extracts raw text from a PDF using pdfplumber.

    Pages for which pdfplumber returns no text (e.g. image-only pages)
    are skipped. Each page's text is extracted exactly once — the
    original called ``page.extract_text()`` twice per page (once to
    filter, once for the value), doubling the extraction work.
    """
    with pdfplumber.open(file) as pdf:
        page_texts = (page.extract_text() for page in pdf.pages)
        return "\n".join(text for text in page_texts if text)
|
| 12 |
|
| 13 |
def extract_entities(text, skills_df):
|
| 14 |
-
"""Extract skills and determine technical background."""
|
| 15 |
doc = nlp(text)
|
| 16 |
-
|
| 17 |
-
skills = [token
|
| 18 |
-
|
| 19 |
-
background = "technical" if any(skill in
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
experience_patterns = [
|
| 25 |
-
r"(\d+)\+?\s+years? of experience",
|
| 26 |
-
r"experience\s+of\s+(\d+)\s+years",
|
| 27 |
-
r"(\d+)\s+years? experience",
|
| 28 |
-
r"(\d+)\s+years"
|
| 29 |
-
]
|
| 30 |
-
for pattern in experience_patterns:
|
| 31 |
-
match = re.search(pattern, text, re.IGNORECASE)
|
| 32 |
-
if match:
|
| 33 |
-
return int(match.group(1))
|
| 34 |
-
return 0 # default if not found
|
| 35 |
-
|
| 36 |
-
def classify_field(text):
    """Classify resume into a field based on keywords.

    Args:
        text: Raw resume text.

    Returns:
        The first matching field name, or "General" when no keyword matches.

    Note: keywords are checked in dict-insertion order, so a resume that
    mentions both "python" and "nurse" is classified as the earlier field.
    """
    field_keywords = {
        "Information Technology": ["python", "developer", "software", "IT", "java", "cloud", "linux"],
        "Engineering": ["engineer", "autocad", "mechanical", "electrical", "civil"],
        "Medical": ["nurse", "medical", "doctor", "clinic", "healthcare"],
        "Finance": ["account", "finance", "bank", "tax", "auditor"],
        "HVAC": ["hvac", "refrigeration", "ventilation", "chiller"],
        "Technician": ["technician", "maintenance", "repair", "machinery"],
        "Labor": ["labor", "helper", "construction", "warehouse", "manual"]
    }

    def keyword_matches(keyword, haystack):
        # Bug fix: the uppercase keyword "IT" could never match the lowered
        # text. Short acronyms are now matched case-insensitively on word
        # boundaries (so "it" inside "with" does not count); longer keywords
        # keep the original substring semantics, so "account" still matches
        # "accountant".
        kw = keyword.lower()
        if len(kw) <= 2:
            return re.search(r"\b" + re.escape(kw) + r"\b", haystack) is not None
        return kw in haystack

    text_lower = text.lower()
    for field, keywords in field_keywords.items():
        if any(keyword_matches(keyword, text_lower) for keyword in keywords):
            return field
    return "General"
|
|
|
|
|
|
|
|
|
|
| 1 |
import pdfplumber
|
| 2 |
+
import spacy
|
| 3 |
|
|
|
|
| 4 |
nlp = spacy.load("en_core_web_sm")
|
| 5 |
|
| 6 |
def extract_text_from_pdf(file):
    """Extracts raw text from a PDF using pdfplumber.

    Args:
        file: A path or file-like object accepted by ``pdfplumber.open``.

    Returns:
        The text of all pages joined by newlines; pages with no
        extractable text (e.g. scanned images) are skipped.
    """
    with pdfplumber.open(file) as pdf:
        page_texts = []
        for page in pdf.pages:
            # Extract once per page; the previous version called
            # page.extract_text() twice (filter + value), doubling the work.
            content = page.extract_text()
            if content:
                page_texts.append(content)
        return "\n".join(page_texts)
|
| 9 |
|
| 10 |
def extract_entities(text, skills_df):
    """Extract skills and determine technical background.

    Args:
        text: Raw resume text (e.g. from ``extract_text_from_pdf``).
        skills_df: DataFrame with a ``"Skill"`` column of known skill names.
            Note: matching is per spaCy token, so multi-word skills in the
            column (e.g. "Machine Learning") cannot match — TODO confirm
            whether phrase matching is intended.

    Returns:
        Tuple of (skills, background, years_exp) where skills is a sorted
        list of matched skill tokens, background is "technical" or
        "non-technical", and years_exp is the first number of years of
        experience found in the text (0 when none is stated).
    """
    doc = nlp(text)
    tokens = [token.text.strip() for token in doc if token.text.strip()]
    known_skills = set(skills_df["Skill"].values)
    # sorted() makes the result deterministic; list(set(...)) returned an
    # arbitrary order on every call.
    skills = sorted({token for token in tokens if token in known_skills})
    tech_keywords = {"Python", "Machine Learning", "AI", "DevOps", "Data Science", "Cloud", "Cybersecurity"}
    background = "technical" if any(skill in tech_keywords for skill in skills) else "non-technical"
    # Replaces the hardcoded placeholder (years_exp = 3) with the regex
    # extraction this module previously used.
    years_exp = _extract_years_of_experience(text)
    return skills, background, years_exp


def _extract_years_of_experience(text):
    """Return the first stated number of years of experience in *text*, or 0."""
    import re  # local import: the module top only imports pdfplumber/spacy
    experience_patterns = [
        r"(\d+)\+?\s+years? of experience",
        r"experience\s+of\s+(\d+)\s+years",
        r"(\d+)\s+years? experience",
        r"(\d+)\s+years",
    ]
    for pattern in experience_patterns:
        match = re.search(pattern, text, re.IGNORECASE)
        if match:
            return int(match.group(1))
    return 0  # default when no experience statement is found
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|