Danial7 committed on
Commit
14ce62c
·
verified ·
1 Parent(s): b680025

Update extractor.py

Browse files
Files changed (1) hide show
  1. extractor.py +10 -43
extractor.py CHANGED
@@ -1,52 +1,19 @@
1
- import spacy
2
- import re
3
  import pdfplumber
 
4
 
5
- # Load the spaCy English model
6
  nlp = spacy.load("en_core_web_sm")
7
 
8
  def extract_text_from_pdf(file):
9
- """Extracts raw text from a PDF using pdfplumber."""
10
  with pdfplumber.open(file) as pdf:
11
- return "\n".join(page.extract_text() for page in pdf.pages if page.extract_text())
12
 
13
  def extract_entities(text, skills_df):
14
- """Extract skills and determine technical background."""
15
  doc = nlp(text)
16
- # Skills are matched based on token-level matching with the dataset
17
- skills = [token.text for token in doc if token.text in skills_df['Skill'].values]
18
- technical_keywords = {"Python", "Cloud", "AI", "DevOps", "Security", "C++", "Linux"}
19
- background = "technical" if any(skill in technical_keywords for skill in skills) else "non-technical"
20
- return list(set(skills)), background
21
-
22
- def extract_experience_years(text):
23
- """Roughly estimate years of experience based on common resume patterns."""
24
- experience_patterns = [
25
- r"(\d+)\+?\s+years? of experience",
26
- r"experience\s+of\s+(\d+)\s+years",
27
- r"(\d+)\s+years? experience",
28
- r"(\d+)\s+years"
29
- ]
30
- for pattern in experience_patterns:
31
- match = re.search(pattern, text, re.IGNORECASE)
32
- if match:
33
- return int(match.group(1))
34
- return 0 # default if not found
35
-
36
- def classify_field(text):
37
- """Classify resume into a field based on keywords."""
38
- field_keywords = {
39
- "Information Technology": ["python", "developer", "software", "IT", "java", "cloud", "linux"],
40
- "Engineering": ["engineer", "autocad", "mechanical", "electrical", "civil"],
41
- "Medical": ["nurse", "medical", "doctor", "clinic", "healthcare"],
42
- "Finance": ["account", "finance", "bank", "tax", "auditor"],
43
- "HVAC": ["hvac", "refrigeration", "ventilation", "chiller"],
44
- "Technician": ["technician", "maintenance", "repair", "machinery"],
45
- "Labor": ["labor", "helper", "construction", "warehouse", "manual"]
46
- }
47
-
48
- text_lower = text.lower()
49
- for field, keywords in field_keywords.items():
50
- if any(keyword in text_lower for keyword in keywords):
51
- return field
52
- return "General"
 
 
 
1
  import pdfplumber
2
+ import spacy
3
 
 
4
  nlp = spacy.load("en_core_web_sm")
5
 
6
  def extract_text_from_pdf(file):
 
7
  with pdfplumber.open(file) as pdf:
8
+ return "\n".join([page.extract_text() for page in pdf.pages if page.extract_text()])
9
 
10
  def extract_entities(text, skills_df):
 
11
  doc = nlp(text)
12
+ tokens = [token.text.strip() for token in doc if token.text.strip()]
13
+ skills = list(set([token for token in tokens if token in skills_df["Skill"].values]))
14
+ tech_keywords = {"Python", "Machine Learning", "AI", "DevOps", "Data Science", "Cloud", "Cybersecurity"}
15
+ background = "technical" if any(skill in tech_keywords for skill in skills) else "non-technical"
16
+
17
+ # Dummy logic for years of experience
18
+ years_exp = 3
19
+ return skills, background, years_exp