Spaces:
Sleeping
Sleeping
File size: 1,597 Bytes
c48432d 93a8810 c48432d 93a8810 c48432d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 |
import re

import fitz # PyMuPDF for PDF text extraction
import spacy
# Load the spaCy "en_core_web_sm" pipeline once, at import time, and keep it
# in a module-level name so it can be reused instead of reloaded per call.
nlp = spacy.load("en_core_web_sm")
# Maps a lowercase keyword that may appear in CV text to the normalised
# education level it stands for. Insertion order is significant: lookups
# that walk this mapping and stop at the first hit will see the levels in
# the order listed below (PhD first, High School last).
EDUCATION_LEVELS = {
    keyword: level
    for level, keywords in (
        ("PhD", ("phd", "doctorate")),
        ("Masters", ("masters", "master")),
        ("Bachelors", ("bachelor", "bsc", "ba")),
        ("Diploma", ("diploma",)),
        ("High School", ("high school", "secondary school")),
    )
    for keyword in keywords
}
def extract_text_from_pdf(file_path):
    """Return the text of every page of a document, concatenated in page order.

    Args:
        file_path: Path of the document to open with PyMuPDF (``fitz``).

    Returns:
        A single string made of each page's ``get_text()`` output, with no
        separator inserted between pages. Empty if the document has no pages.
    """
    # Open the document as a context manager so the underlying file handle is
    # released even when text extraction fails part-way through.
    with fitz.open(file_path) as doc:
        # join() builds the result in one pass instead of repeated `+=`.
        return "".join(page.get_text() for page in doc)
def parse_cv(file_path):
    """Return the plain text of the CV stored at ``file_path``.

    Args:
        file_path: Path of the CV document to read.

    Returns:
        The text produced by ``extract_text_from_pdf`` for that file.
    """
    # The text used to be run through the spaCy pipeline here, but the
    # resulting Doc was discarded: it only cost time and could raise on very
    # long documents. Run NLP at the point where its output is actually used.
    return extract_text_from_pdf(file_path)
def extract_education_level(text, levels=None):
    """Return the first education level whose keyword occurs in ``text``.

    Keywords are matched case-insensitively as whole words, so a short
    keyword such as "ba" no longer fires inside "based" or "database".
    Keywords longer than three characters also accept a trailing "s"
    ("bachelor" matches "bachelors"), and any run of whitespace in the text
    matches a space in a multi-word keyword, so phrases wrapped across lines
    are still found.

    Args:
        text: CV text to search. ``None`` or an empty string yields
            "Not Found".
        levels: Optional mapping of lowercase keyword -> level name. Defaults
            to the module-level ``EDUCATION_LEVELS``.

    Returns:
        The level of the first keyword (in mapping order) found in the text,
        or "Not Found" when no keyword matches.
    """
    if not text:
        return "Not Found"
    if levels is None:
        levels = EDUCATION_LEVELS

    text_lower = text.lower()
    for key, level in levels.items():
        # Escape each word of the keyword and let any whitespace separate them.
        body = r"\s+".join(re.escape(part) for part in key.split())
        # Plural tolerance is limited to longer keywords; applying it to
        # abbreviations would turn "ba" into a match for "bas".
        plural = "s?" if len(key) > 3 else ""
        # Lookarounds instead of \b so keywords ending in punctuation work too.
        pattern = r"(?<!\w)" + body + plural + r"(?!\w)"
        if re.search(pattern, text_lower):
            return level
    return "Not Found"
def identify_cv_type(text):
    """Classify CV text as "Technical", "Non-Technical" or "Unknown".

    Counts how many distinct technical and non-technical keywords occur in
    the text and returns the label of the larger count. Keywords are matched
    case-insensitively as whole words, so "it" is not counted inside "with"
    and "hr" is not counted inside "three". Keywords longer than three
    characters also accept a trailing "s" ("developer" matches "developers").

    Args:
        text: CV text to classify. ``None`` or an empty string yields
            "Unknown".

    Returns:
        "Technical" when more technical keywords are present,
        "Non-Technical" when more non-technical keywords are present, and
        "Unknown" on a tie (including when nothing matches).
    """
    technical_keywords = ["python", "java", "c++", "sql", "software", "engineering", "developer", "data science", "machine learning", "it", "technology"]
    non_technical_keywords = ["management", "sales", "marketing", "human resources", "hr", "customer service", "finance", "accounting", "education", "teaching"]

    if not text:
        return "Unknown"
    text_lower = text.lower()

    def _has_keyword(keyword):
        # Escape each word ("c++" contains regex metacharacters) and let any
        # whitespace, including a line break, separate multi-word keywords.
        body = r"\s+".join(re.escape(part) for part in keyword.split())
        # No plural for short keywords: "it" must not match "its".
        plural = "s?" if len(keyword) > 3 else ""
        # Lookarounds rather than \b, which cannot anchor after "c++".
        pattern = r"(?<!\w)" + body + plural + r"(?!\w)"
        return re.search(pattern, text_lower) is not None

    tech_matches = sum(_has_keyword(word) for word in technical_keywords)
    non_tech_matches = sum(_has_keyword(word) for word in non_technical_keywords)

    if tech_matches > non_tech_matches:
        return "Technical"
    if non_tech_matches > tech_matches:
        return "Non-Technical"
    return "Unknown"
|