import re

import fitz  # PyMuPDF for PDF text extraction
import spacy

# Load the small English spaCy pipeline once, at import time, so the model is
# not reloaded on every call.
nlp = spacy.load("en_core_web_sm")

# Maps lower-case keywords looked for in CV text to a normalised education
# level label.
#
# Order matters: extract_education_level() walks this dict in insertion order
# and returns the label of the first keyword it finds, so entries are listed
# from the highest qualification down to the lowest.
EDUCATION_LEVELS = {
    "phd": "PhD",
    "doctorate": "PhD",
    "masters": "Masters",
    "master": "Masters",
    "bachelor": "Bachelors",
    "bsc": "Bachelors",
    "ba": "Bachelors",
    "diploma": "Diploma",
    "high school": "High School",
    "secondary school": "High School"
}

def extract_text_from_pdf(file_path):
    """Return the plain text of every page of a PDF, concatenated in order.

    Args:
        file_path: Path of the PDF file to open with PyMuPDF.

    Returns:
        A single string containing the text of all pages, in page order.
        The string is empty when the document has no extractable text.
    """
    # Using the document as a context manager guarantees it is closed (and
    # its file handle released) even if extraction fails part-way through.
    with fitz.open(file_path) as doc:
        # join() builds the result in one pass instead of repeated `+=`.
        return "".join(page.get_text() for page in doc)

def parse_cv(file_path):
    """Read the CV stored at ``file_path`` and return its raw text.

    Args:
        file_path: Path of the PDF file containing the CV.

    Returns:
        The text extracted from the PDF by ``extract_text_from_pdf``.
    """
    # NOTE: no spaCy processing happens here because nothing consumes its
    # output yet. Run `nlp(text)` at this point once entity-level parsing is
    # actually needed, and return or use the resulting Doc.
    return extract_text_from_pdf(file_path)

def extract_education_level(text):
    """Return the first education level from EDUCATION_LEVELS found in text.

    Keywords are matched case-insensitively as whole words or phrases, so
    "ba" is not found inside "database" or "MBA" and "master" is not found
    inside "webmaster". A trailing plural or possessive ("bachelors",
    "master's") is accepted, and the words of a multi-word keyword may be
    separated by any whitespace, which covers phrases that wrap across lines
    in text extracted from a PDF.

    Args:
        text: CV text to search. ``None`` or an empty string is allowed.

    Returns:
        The label of the first keyword in EDUCATION_LEVELS (in insertion
        order) that occurs in ``text``, or ``"Not Found"`` if none does.
    """
    if not text:
        return "Not Found"

    text_lower = text.lower()
    for key, level in EDUCATION_LEVELS.items():
        # Escape each word and let any run of whitespace separate them.
        phrase = r"\s+".join(re.escape(part) for part in key.split())
        # Lookarounds are used instead of \b so the boundary test stays
        # correct next to apostrophes and other punctuation.
        pattern = r"(?<!\w)" + phrase + r"(?:'?s)?(?!\w)"
        if re.search(pattern, text_lower):
            return level
    return "Not Found"

# Keywords whose presence suggests a technical CV.
_TECHNICAL_KEYWORDS = (
    "python", "java", "c++", "sql", "software", "engineering", "developer",
    "data science", "machine learning", "it", "technology",
)

# Keywords whose presence suggests a non-technical CV.
_NON_TECHNICAL_KEYWORDS = (
    "management", "sales", "marketing", "human resources", "hr",
    "customer service", "finance", "accounting", "education", "teaching",
)

# Keywords this short ("it", "hr") occur inside many ordinary words ("with",
# "three"), so they only count when they stand alone as a word.
_WHOLE_WORD_MAX_LEN = 2


def _count_keyword_hits(text_lower, keywords):
    """Count how many distinct keywords occur in the lower-cased text."""
    hits = 0
    for keyword in keywords:
        if len(keyword) <= _WHOLE_WORD_MAX_LEN:
            # Lookarounds rather than \b keep the test valid for any keyword.
            pattern = r"(?<!\w)" + re.escape(keyword) + r"(?!\w)"
            found = re.search(pattern, text_lower) is not None
        else:
            # Longer keywords keep substring matching so that variants such
            # as "developers" or "mysql" still count.
            found = keyword in text_lower
        hits += found
    return hits


def identify_cv_type(text):
    """Classify CV text as technical or non-technical by keyword presence.

    Each keyword contributes at most one hit no matter how often it occurs.
    One- and two-letter keywords must appear as standalone words; longer
    keywords match anywhere in the text. Matching is case-insensitive.

    Args:
        text: CV text to classify. ``None`` or an empty string is allowed.

    Returns:
        ``"Technical"`` if more technical than non-technical keywords are
        present, ``"Non-Technical"`` for the opposite, and ``"Unknown"`` when
        the counts tie (including when nothing matches).
    """
    if not text:
        return "Unknown"

    text_lower = text.lower()
    tech_matches = _count_keyword_hits(text_lower, _TECHNICAL_KEYWORDS)
    non_tech_matches = _count_keyword_hits(text_lower, _NON_TECHNICAL_KEYWORDS)

    if tech_matches > non_tech_matches:
        return "Technical"
    if non_tech_matches > tech_matches:
        return "Non-Technical"
    return "Unknown"