"""Extract text from CV PDFs and classify it with simple keyword heuristics."""

import functools
import re

import fitz  # PyMuPDF for PDF text extraction
import spacy

# spaCy pipeline, loaded once at import time.
nlp = spacy.load("en_core_web_sm")

# Maps a keyword to the education label reported for it. Order matters:
# extract_education_level() returns the label of the first keyword found, so
# higher qualifications are listed first.
EDUCATION_LEVELS = {
    "phd": "PhD",
    "doctorate": "PhD",
    "masters": "Masters",
    "master": "Masters",
    "bachelor": "Bachelors",
    "bsc": "Bachelors",
    "ba": "Bachelors",
    "diploma": "Diploma",
    "high school": "High School",
    "secondary school": "High School",
}

# Keywords matched case-insensitively as whole words.
_TECHNICAL_KEYWORDS = (
    "python",
    "java",
    "c++",
    "sql",
    "software",
    "engineering",
    "developer",
    "data science",
    "machine learning",
    "technology",
)
_NON_TECHNICAL_KEYWORDS = (
    "management",
    "sales",
    "marketing",
    "human resources",
    "customer service",
    "finance",
    "accounting",
    "education",
    "teaching",
)

# Acronyms are matched case-sensitively: a case-insensitive "it" would match
# the pronoun in nearly every document and skew the classification.
_TECHNICAL_ACRONYMS = ("IT",)
_NON_TECHNICAL_ACRONYMS = ("HR",)


@functools.lru_cache(maxsize=None)
def _keyword_pattern(keyword, *, ignore_case=True):
    """Return a compiled regex matching *keyword* as a whole word or phrase.

    Spaces inside the keyword match any run of whitespace, so a phrase that
    is split across a line break still matches. An optional trailing "s" is
    accepted so plurals count ("developers", "bachelors").
    """
    body = r"\s+".join(re.escape(part) for part in keyword.split())
    # Lookarounds are used instead of \b because \b does not work next to
    # non-word characters such as the "+" in "c++". The trailing lookahead
    # only rejects letters, so "python3" and "c++11" still match.
    pattern = rf"(?<!\w){body}s?(?![^\W\d_])"
    return re.compile(pattern, re.IGNORECASE if ignore_case else 0)


def _count_matches(text, keywords, acronyms):
    """Count how many of the given keywords and acronyms occur in *text*.

    Each keyword contributes at most 1, however often it appears.
    """
    hits = sum(1 for kw in keywords if _keyword_pattern(kw).search(text))
    hits += sum(
        1
        for acronym in acronyms
        if _keyword_pattern(acronym, ignore_case=False).search(text)
    )
    return hits


def extract_text_from_pdf(file_path):
    """Return the text of every page of the PDF at *file_path*, concatenated.

    The document is opened with PyMuPDF and closed again before returning,
    including when extraction of a page raises.
    """
    with fitz.open(file_path) as doc:
        return "".join(page.get_text() for page in doc)


def parse_cv(file_path):
    """Return the raw text extracted from the CV PDF at *file_path*.

    No NLP processing is applied here; additional parsing (for example with
    the module-level ``nlp`` pipeline) can be added when it is actually used.
    """
    return extract_text_from_pdf(file_path)


def extract_education_level(text: str) -> str:
    """Return the education label of the first keyword found in *text*.

    Keywords are tried in the order they appear in ``EDUCATION_LEVELS`` and
    matched case-insensitively as whole words, so "ba" does not match inside
    "database". Returns "Not Found" when nothing matches or *text* is empty.
    """
    if not text:
        return "Not Found"
    for key, level in EDUCATION_LEVELS.items():
        if _keyword_pattern(key).search(text):
            return level
    return "Not Found"


def identify_cv_type(text: str) -> str:
    """Classify *text* as "Technical", "Non-Technical" or "Unknown".

    Counts how many distinct technical and non-technical keywords occur as
    whole words and returns the category with more hits. Ties, including no
    hits at all or empty *text*, return "Unknown".
    """
    if not text:
        return "Unknown"
    tech_matches = _count_matches(text, _TECHNICAL_KEYWORDS, _TECHNICAL_ACRONYMS)
    non_tech_matches = _count_matches(
        text, _NON_TECHNICAL_KEYWORDS, _NON_TECHNICAL_ACRONYMS
    )
    if tech_matches > non_tech_matches:
        return "Technical"
    if non_tech_matches > tech_matches:
        return "Non-Technical"
    return "Unknown"