Danial7 committed on
Commit
c48432d
·
verified ·
1 Parent(s): 93a8810

Update utils/parser.py

Browse files
Files changed (1) hide show
  1. utils/parser.py +47 -21
utils/parser.py CHANGED
@@ -1,26 +1,52 @@
1
-
2
- import fitz # PyMuPDF
3
  import spacy
4
 
5
  nlp = spacy.load("en_core_web_sm")
6
 
7
- def parse_cv(file_path):
 
 
 
 
 
 
 
 
 
 
 
 
 
8
  text = ""
9
- with fitz.open(file_path) as doc:
10
- for page in doc:
11
- text += page.get_text()
12
-
13
- doc_nlp = nlp(text)
14
- education_level = "Unknown"
15
- if "bachelor" in text.lower():
16
- education_level = "Bachelor's Degree"
17
- elif "master" in text.lower():
18
- education_level = "Master's Degree"
19
- elif "phd" in text.lower() or "doctor" in text.lower():
20
- education_level = "PhD or Doctorate"
21
- elif "high school" in text.lower():
22
- education_level = "High School"
23
-
24
- cv_type = "Technical" if any(tok.text.lower() in ["engineer", "developer", "python", "data"] for tok in doc_nlp) else "Non-Technical"
25
-
26
- return text, education_level, cv_type
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import fitz # PyMuPDF for PDF text extraction
 
2
  import spacy
3
 
4
  nlp = spacy.load("en_core_web_sm")
5
 
6
# Maps a lowercase keyword searched for in the CV text to the label reported
# for it. extract_education_level() walks this dict in insertion order and
# returns the label of the first keyword it finds, so entries are listed from
# the highest qualification to the lowest; keep that order when adding keys.
EDUCATION_LEVELS = {
    "phd": "PhD",
    "doctorate": "PhD",
    "masters": "Masters",
    "master": "Masters",
    "bachelor": "Bachelors",
    "bsc": "Bachelors",
    "ba": "Bachelors",
    "diploma": "Diploma",
    "high school": "High School",
    "secondary school": "High School"
}
18
+
19
def extract_text_from_pdf(file_path):
    """Return the concatenated plain text of every page in a PDF.

    Args:
        file_path: Path of the PDF file to open with PyMuPDF.

    Returns:
        The text of all pages joined in page order; an empty string when
        the document has no pages or no extractable text.
    """
    # The context manager closes the document even if extracting a page
    # raises; the handle used to be left open for the garbage collector.
    with fitz.open(file_path) as doc:
        # join() builds the result in one pass instead of repeated `+=`.
        return "".join(page.get_text() for page in doc)
25
+
26
def parse_cv(file_path):
    """Extract the raw text of the CV stored at *file_path*.

    Args:
        file_path: Path of the PDF file containing the CV.

    Returns:
        The CV's full text as a single string, exactly as produced by
        extract_text_from_pdf().
    """
    # The spaCy pipeline was previously run over the text here and its
    # result thrown away. That cost a full NLP pass per CV and added a
    # failure point on very long documents without affecting the return
    # value, so the call is dropped. Re-introduce `nlp(text)` only once
    # something actually consumes the parsed document.
    return extract_text_from_pdf(file_path)
31
+
32
def extract_education_level(text, levels=None):
    """Return the label of the first education keyword found in *text*.

    Keywords are tried in the mapping's iteration order, so the mapping
    should list the highest qualification first.

    Matching is case-insensitive and anchored at the start of a word.
    Keywords of three characters or fewer (e.g. "ba", "bsc", "phd") must
    also end at a word boundary, so "ba" no longer matches "database" or
    "based". Longer keywords may carry a suffix ("bachelor" still matches
    "bachelors" and "bachelor's"). Words of a multi-word keyword may be
    separated by any whitespace, including the line breaks that PDF text
    extraction inserts.

    Args:
        text: The CV text to scan. Empty or None yields "Not Found".
        levels: Optional mapping of lowercase keyword -> label. Defaults
            to the module-level EDUCATION_LEVELS table.

    Returns:
        The label for the first matching keyword, or "Not Found".
    """
    import re  # local import keeps this function self-contained

    if not text:
        return "Not Found"
    if levels is None:
        levels = EDUCATION_LEVELS

    text_lower = text.lower()
    for key, level in levels.items():
        # Tolerate arbitrary whitespace between the words of a phrase.
        body = r"\s+".join(re.escape(part) for part in key.split())
        # Short abbreviations are the ones that occur by accident inside
        # ordinary words, so only they need the trailing boundary.
        tail = r"(?!\w)" if len(key) <= 3 else ""
        if re.search(r"(?<!\w)" + body + tail, text_lower):
            return level
    return "Not Found"
38
+
39
def identify_cv_type(text):
    """Classify CV text as "Technical", "Non-Technical" or "Unknown".

    Counts how many distinct keywords from each list occur in *text*
    (case-insensitively) and returns the category with more hits; a tie,
    including zero hits on both sides, yields "Unknown".

    Keywords must start at a word boundary. Keywords of three characters
    or fewer ("it", "hr", "sql", "c++") must also end at one, so "it" is
    no longer found inside "with" nor "hr" inside "three". Longer keywords
    may carry a suffix ("developer" still matches "developers").

    Args:
        text: The CV text to classify. Empty or None yields "Unknown".

    Returns:
        One of "Technical", "Non-Technical" or "Unknown".
    """
    import re  # local import keeps this function self-contained

    technical_keywords = ["python", "java", "c++", "sql", "software", "engineering", "developer", "data science", "machine learning", "it", "technology"]
    non_technical_keywords = ["management", "sales", "marketing", "human resources", "hr", "customer service", "finance", "accounting", "education", "teaching"]

    if not text:
        return "Unknown"
    text_lower = text.lower()

    def count_hits(keywords):
        """Return how many of *keywords* occur in the lowercased text."""
        hits = 0
        for keyword in keywords:
            # Tolerate arbitrary whitespace between the words of a phrase.
            body = r"\s+".join(re.escape(part) for part in keyword.split())
            # Lookarounds rather than \b: "c++" ends in a non-word
            # character, where \b would never match.
            tail = r"(?!\w)" if len(keyword) <= 3 else ""
            if re.search(r"(?<!\w)" + body + tail, text_lower):
                hits += 1
        return hits

    # NOTE: because the text is lowercased, the pronoun "it" still counts
    # as the "it" keyword when it appears as a standalone word.
    tech_matches = count_hits(technical_keywords)
    non_tech_matches = count_hits(non_technical_keywords)

    if tech_matches > non_tech_matches:
        return "Technical"
    if non_tech_matches > tech_matches:
        return "Non-Technical"
    return "Unknown"