Danial7 committed on
Commit
0cb6239
·
verified ·
1 Parent(s): 856e6a7

Create extractor.py

Browse files
Files changed (1) hide show
  1. extractor.py +25 -0
extractor.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import spacy
2
+ import re
3
+ import pandas as pd
4
+
5
+ nlp = spacy.load("en_core_web_sm")
6
+
7
def extract_entities(text, skills_df):
    """Find known skills in *text* and classify the background as technical or not.

    The text is tokenized with the module-level spaCy pipeline ``nlp``. Each
    token is compared, case-sensitively and as a whole token, against the
    values in the ``"Skill"`` column of *skills_df*.

    Args:
        text: Raw text to scan.
        skills_df: Table with a ``"Skill"`` column listing the known skills.

    Returns:
        A ``(found_skills, background)`` tuple. ``found_skills`` is a list of
        the distinct matching tokens in order of first appearance in *text*.
        ``background`` is ``"technical"`` when at least one found skill is in
        the built-in technical keyword set, otherwise ``"non-technical"``.
    """
    # Build the lookup set once; testing each token against the raw column
    # array would rescan the whole column for every token.
    known_skills = set(skills_df["Skill"].values)

    doc = nlp(text)
    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # result is stable across runs (a plain set has no defined order).
    found_skills = list(
        dict.fromkeys(token.text for token in doc if token.text in known_skills)
    )

    # Background field classification
    tech_keywords = {"Python", "ML", "Cloud", "DevOps", "AI"}
    if tech_keywords.isdisjoint(found_skills):
        background = "non-technical"
    else:
        background = "technical"

    return found_skills, background
17
+
18
def extract_experience_years(text):
    """Estimate a span of experience from the calendar years mentioned in *text*.

    Every standalone four-digit year between 1900 and 2099 is collected, and
    the difference between the latest and the earliest one is returned.

    Args:
        text: Raw text containing date ranges such as ``"2018 - 2021"`` or
            ``"Jan 2017 – Mar 2023"``.

    Returns:
        ``max(year) - min(year)`` as an int when at least two years are found,
        otherwise ``0`` (including for empty or missing text).
    """
    if not text:
        return 0

    # Matches the year in patterns like: "2018 - 2021" or "Jan 2017 – Mar 2023".
    # The digit lookarounds stop fragments of longer numbers (ZIP codes, phone
    # numbers, IDs) from being read as years, and the 19xx/20xx prefix rejects
    # arbitrary four-digit quantities. A month name glued to the year
    # ("Jan2017") still matches because only neighbouring digits are excluded.
    year_pattern = r"(?<!\d)(?:19|20)\d{2}(?!\d)"
    years = [int(year) for year in re.findall(year_pattern, text)]

    if len(years) >= 2:
        return max(years) - min(years)
    return 0