# Spaces: Sleeping  (appears to be hosting-page status text captured with the file; not program code)
import re

import fitz  # PyMuPDF for PDF text extraction
import spacy
# spaCy pipeline loaded once at import time so it is not reloaded per call.
# Requires the `en_core_web_sm` model package to be installed.
nlp = spacy.load("en_core_web_sm")
# Maps a lowercase keyword found in CV text to a normalized education level.
# Order matters: entries are listed from the highest level to the lowest, and
# `extract_education_level` returns the level of the first keyword that
# matches, so a CV mentioning several degrees resolves to the highest one.
EDUCATION_LEVELS = {
    "phd": "PhD",
    "doctorate": "PhD",
    "masters": "Masters",
    "master": "Masters",
    "bachelor": "Bachelors",
    "bsc": "Bachelors",
    "ba": "Bachelors",
    "diploma": "Diploma",
    "high school": "High School",
    "secondary school": "High School",
}
def extract_text_from_pdf(file_path):
    """Return the plain text of every page of a PDF, concatenated.

    Args:
        file_path: Path of the PDF, passed to ``fitz.open``.

    Returns:
        The text of all pages in page order, joined without a separator.
        An empty string when the document has no pages.
    """
    # The context manager closes the document even if extracting a page
    # raises, so the file handle is never leaked.
    with fitz.open(file_path) as doc:
        return "".join(page.get_text() for page in doc)
def parse_cv(file_path):
    """Read a CV stored as a PDF and return its raw text.

    Args:
        file_path: Path of the PDF, passed to ``extract_text_from_pdf``.

    Returns:
        The extracted text, unmodified.
    """
    # NOTE: the spaCy pipeline (`nlp`) is deliberately not run here: nothing
    # consumes its output yet, so running it would only cost time. Call
    # `nlp(text)` at this point once entity-level parsing is added.
    return extract_text_from_pdf(file_path)
def extract_education_level(text, levels=None):
    """Return the education level named by the first matching keyword.

    Keywords are matched case-insensitively as whole words or phrases, so
    "ba" matches "BA" but not "database". A trailing plural "s" is accepted
    ("bachelor" also matches "Bachelors"), and the words of a multi-word
    keyword may be separated by any whitespace, including a line break.

    Args:
        text: Plain text of the CV. ``None`` or an empty string is allowed.
        levels: Optional mapping of lowercase keyword -> level name, checked
            in iteration order. Defaults to ``EDUCATION_LEVELS``.

    Returns:
        The level of the first keyword found, or "Not Found" when nothing
        matches or there is no text.
    """
    if not text:
        return "Not Found"
    if levels is None:
        levels = EDUCATION_LEVELS

    for key, level in levels.items():
        # Escape each word separately so the gap between words can be any
        # run of whitespace; PDF extraction often breaks phrases across lines.
        phrase = r"\s+".join(re.escape(part) for part in key.split())
        # Lookarounds act as word boundaries; "s?" keeps plural forms matching.
        pattern = r"(?<!\w)" + phrase + r"s?(?!\w)"
        if re.search(pattern, text, re.IGNORECASE):
            return level
    return "Not Found"
def identify_cv_type(text):
    """Classify CV text as technical or non-technical by keyword presence.

    Each keyword counts at most once, and only when it occurs as a whole
    word or phrase (case-insensitive). Short keywords such as "it" and "hr"
    therefore no longer match inside unrelated words like "with" or "three".
    The standalone pronoun "it" still counts as the keyword "it".

    Args:
        text: Plain text of the CV. ``None`` or an empty string is allowed.

    Returns:
        "Technical" or "Non-Technical" for whichever keyword list has more
        distinct hits, or "Unknown" on a tie (including no hits at all).
    """
    technical_keywords = [
        "python", "java", "c++", "sql", "software", "engineering",
        "developer", "data science", "machine learning", "it", "technology",
    ]
    non_technical_keywords = [
        "management", "sales", "marketing", "human resources", "hr",
        "customer service", "finance", "accounting", "education", "teaching",
    ]
    if not text:
        return "Unknown"

    def count_present(keywords):
        """Count how many of the keywords occur in the text as whole words."""
        found = 0
        for keyword in keywords:
            # Words of a phrase may be separated by any whitespace run.
            phrase = r"\s+".join(re.escape(part) for part in keyword.split())
            # Lookarounds instead of \b so that "c++" (ending in a
            # non-word character) is still bounded correctly.
            pattern = r"(?<!\w)" + phrase + r"(?!\w)"
            if re.search(pattern, text, re.IGNORECASE):
                found += 1
        return found

    tech_matches = count_present(technical_keywords)
    non_tech_matches = count_present(non_technical_keywords)

    if tech_matches > non_tech_matches:
        return "Technical"
    if non_tech_matches > tech_matches:
        return "Non-Technical"
    return "Unknown"