File size: 1,597 Bytes
c48432d
93a8810
 
 
 
c48432d
 
 
 
 
 
 
 
 
 
 
 
 
 
93a8810
c48432d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
import re

import fitz  # PyMuPDF for PDF text extraction
import spacy

# spaCy pipeline "en_core_web_sm", loaded once when this module is imported.
nlp = spacy.load("en_core_web_sm")

# Keyword -> canonical education level label.
# Keys are lowercase because extract_education_level compares them against
# lowercased text. Order matters: that function walks the entries top to
# bottom and returns the first hit, so the higher levels are listed first.
EDUCATION_LEVELS = {
    "phd": "PhD",
    "doctorate": "PhD",
    "masters": "Masters",
    "master": "Masters",
    "bachelor": "Bachelors",
    "bsc": "Bachelors",
    "ba": "Bachelors",
    "diploma": "Diploma",
    "high school": "High School",
    "secondary school": "High School"
}

def extract_text_from_pdf(file_path):
    """Return the concatenated text of every page of the PDF at *file_path*.

    Args:
        file_path: Location of the PDF, passed unchanged to ``fitz.open``.

    Returns:
        A single string holding each page's ``get_text()`` output in page
        order; empty when the document has no pages.
    """
    # The context manager closes the document deterministically, even when
    # extraction of a page raises.
    with fitz.open(file_path) as doc:
        # join() builds the result once instead of re-copying it per page.
        return "".join(page.get_text() for page in doc)

def parse_cv(file_path):
    """Extract the raw text of the CV stored as a PDF at *file_path*.

    Args:
        file_path: Location of the PDF, forwarded to
            ``extract_text_from_pdf``.

    Returns:
        The extracted text, unmodified.
    """
    # No spaCy pass here: nothing consumes a parsed Doc yet, and running the
    # pipeline over a whole CV only to discard the result is wasted work.
    # Call ``nlp(text)`` at the point where further parsing is actually added.
    return extract_text_from_pdf(file_path)

def extract_education_level(text):
    """Return the first education level whose keyword occurs in *text*.

    Keywords are tried in ``EDUCATION_LEVELS`` order, so the earliest entry
    wins when several are present. Matching is case-insensitive and on whole
    words only, which keeps short keys such as "ba" from firing inside
    unrelated words like "database". Keywords longer than three characters
    also accept a trailing "s" ("bachelors", "diplomas"), and the words of a
    multi-word keyword may be separated by any run of whitespace.

    Args:
        text: CV text to scan. Empty or ``None`` yields "Not Found".

    Returns:
        The canonical level label from ``EDUCATION_LEVELS``, or "Not Found"
        when no keyword matches.
    """
    if not text:
        return "Not Found"

    text_lower = text.lower()
    for key, level in EDUCATION_LEVELS.items():
        # Extracted text can wrap a phrase such as "high school" across
        # lines, so accept any whitespace between its words.
        fragment = r"\s+".join(re.escape(word) for word in key.split())
        # Short acronyms must match exactly; longer words tolerate a plural.
        if len(key) > 3:
            fragment += "s?"
        # Lookarounds rather than \b so the boundary test does not depend on
        # the keyword itself starting or ending with a word character.
        if re.search(r"(?<!\w)" + fragment + r"(?!\w)", text_lower):
            return level
    return "Not Found"

def _count_keyword_hits(text_lower, keywords):
    """Count how many of *keywords* occur as whole words in *text_lower*."""
    hits = 0
    for keyword in keywords:
        # Words of a phrase may be separated by any whitespace, including a
        # line break left over from text extraction.
        fragment = r"\s+".join(re.escape(word) for word in keyword.split())
        # Short acronyms ("it", "hr", "sql") must match exactly; longer
        # keywords also accept a plural "s" ("developers").
        if len(keyword) > 3:
            fragment += "s?"
        # Lookarounds instead of \b so that "c++", which ends in a non-word
        # character, is still bounded correctly.
        if re.search(r"(?<!\w)" + fragment + r"(?!\w)", text_lower):
            hits += 1
    return hits


def identify_cv_type(text):
    """Classify CV text as "Technical", "Non-Technical" or "Unknown".

    Each keyword list contributes one point per distinct keyword found; the
    list with more points decides the label. Matching is case-insensitive
    and on whole words only, so "it" does not fire inside "with" and "hr"
    does not fire inside "three". Because the text is lowercased first, the
    pronoun "it" is still indistinguishable from the acronym "IT".

    Args:
        text: CV text to classify. Empty or ``None`` yields "Unknown".

    Returns:
        "Technical" when more technical keywords are present,
        "Non-Technical" when more non-technical keywords are present, and
        "Unknown" on a tie (including no matches at all).
    """
    technical_keywords = [
        "python", "java", "c++", "sql", "software", "engineering",
        "developer", "data science", "machine learning", "it", "technology",
    ]
    non_technical_keywords = [
        "management", "sales", "marketing", "human resources", "hr",
        "customer service", "finance", "accounting", "education", "teaching",
    ]

    if not text:
        return "Unknown"

    text_lower = text.lower()
    tech_matches = _count_keyword_hits(text_lower, technical_keywords)
    non_tech_matches = _count_keyword_hits(text_lower, non_technical_keywords)

    if tech_matches > non_tech_matches:
        return "Technical"
    if non_tech_matches > tech_matches:
        return "Non-Technical"
    return "Unknown"