Spaces:

Danial7
/

CV_Analyzer_Final

Sleeping

App Files Files Community

Danial7 committed on May 18, 2025

Commit

c48432d

verified ·

1 Parent(s): 93a8810

Update utils/parser.py

Browse files

Files changed (1) hide show

utils/parser.py +47 -21

utils/parser.py CHANGED Viewed

@@ -1,26 +1,52 @@
-import fitz  # PyMuPDF
 import spacy
 nlp = spacy.load("en_core_web_sm")
-def parse_cv(file_path):
     text = ""
-    with fitz.open(file_path) as doc:
-        for page in doc:
-            text += page.get_text()
-    doc_nlp = nlp(text)
-    education_level = "Unknown"
-    if "bachelor" in text.lower():
-        education_level = "Bachelor's Degree"
-    elif "master" in text.lower():
-        education_level = "Master's Degree"
-    elif "phd" in text.lower() or "doctor" in text.lower():
-        education_level = "PhD or Doctorate"
-    elif "high school" in text.lower():
-        education_level = "High School"
-    cv_type = "Technical" if any(tok.text.lower() in ["engineer", "developer", "python", "data"] for tok in doc_nlp) else "Non-Technical"
-    return text, education_level, cv_type

import re

import fitz  # PyMuPDF for PDF text extraction
import spacy
 nlp = spacy.load("en_core_web_sm")
# Lowercase keyword -> education level label.
#
# Order matters: the table is scanned top to bottom and the first keyword
# found in the CV wins, so tiers are listed from the highest degree down.
# A CV that mentions both a bachelor's and a master's degree is therefore
# reported as "Masters". Keep new entries inside the right tier.
EDUCATION_LEVELS = {
    # Doctorate
    "phd": "PhD",
    "ph.d": "PhD",
    "doctorate": "PhD",
    # Master's
    "masters": "Masters",
    "master": "Masters",
    "msc": "Masters",
    "m.sc": "Masters",
    # Bachelor's
    "bachelor": "Bachelors",
    "bsc": "Bachelors",
    "b.sc": "Bachelors",
    "ba": "Bachelors",
    "b.a": "Bachelors",
    # Below degree level
    "diploma": "Diploma",
    "high school": "High School",
    "secondary school": "High School",
}
def extract_text_from_pdf(file_path) -> str:
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        file_path: Location of the PDF; handed to ``fitz.open`` unchanged.

    Returns:
        The ``get_text()`` output of each page joined together, with no
        separator inserted between pages.
    """
    # Open the document as a context manager so it is closed again even when
    # extracting a page raises, instead of leaving the file handle open.
    with fitz.open(file_path) as doc:
        return "".join(page.get_text() for page in doc)
def parse_cv(file_path) -> str:
    """Return the raw text of the CV stored at ``file_path``.

    Args:
        file_path: Location of the CV PDF; forwarded to
            ``extract_text_from_pdf``.

    Returns:
        The extracted text, exactly as ``extract_text_from_pdf`` produced it.
    """
    # This function only returns text, so no spaCy pass is run here: building
    # a Doc that nobody reads costs time on every upload. Add the ``nlp(text)``
    # call back when a parsed Doc is actually consumed.
    return extract_text_from_pdf(file_path)
def extract_education_level(text, levels=None):
    """Return the education level of the first table keyword found in ``text``.

    Keywords are matched case-insensitively as whole words or phrases, not as
    raw substrings, so a short key such as "ba" matches "BA in History" but
    not "database". Keywords longer than three characters also accept a
    trailing plural "s" ("masters"); possessives ("master's") match because
    the apostrophe already ends the word.

    Args:
        text: CV text to scan. ``None`` or an empty string yields "Not Found".
        levels: Optional mapping of lowercase keyword -> level label, scanned
            in iteration order. Defaults to the module-level
            ``EDUCATION_LEVELS`` table.

    Returns:
        The label of the first keyword (in table order) that occurs in
        ``text``, or "Not Found" when none does.
    """
    if not text:
        return "Not Found"
    if levels is None:
        levels = EDUCATION_LEVELS

    text_lower = text.lower()
    for key, level in levels.items():
        # Short acronyms get no plural allowance, otherwise "ba" would also
        # accept "bas". Lookarounds are used instead of \b so that keys
        # containing punctuation still get a proper boundary check.
        plural = "s?" if len(key) > 3 else ""
        pattern = rf"(?<!\w){re.escape(key)}{plural}(?!\w)"
        if re.search(pattern, text_lower):
            return level
    return "Not Found"
def identify_cv_type(text):
    """Classify CV text as "Technical", "Non-Technical" or "Unknown".

    Each keyword list scores one point per keyword that occurs at least once
    in ``text`` (case-insensitively); the list with the higher score wins.

    Keywords are matched as whole words or phrases rather than raw
    substrings, so "it" does not fire on "with" and "hr" does not fire on
    "three". Keywords longer than three characters also accept a trailing
    plural "s" ("developers").

    Args:
        text: CV text to classify. ``None`` or an empty string yields
            "Unknown".

    Returns:
        "Technical" when more technical keywords are present,
        "Non-Technical" when more non-technical keywords are present, and
        "Unknown" on a tie (including when nothing matches).
    """
    technical_keywords = [
        "python", "java", "c++", "sql", "software", "engineering",
        "developer", "data science", "machine learning", "it", "technology",
    ]
    non_technical_keywords = [
        "management", "sales", "marketing", "human resources", "hr",
        "customer service", "finance", "accounting", "education", "teaching",
    ]

    if not text:
        return "Unknown"
    text_lower = text.lower()

    def count_present(keywords):
        """Count how many of ``keywords`` occur in the text at least once."""
        hits = 0
        for word in keywords:
            # Short acronyms get no plural allowance ("its", "hrs" are not
            # keywords). Lookarounds rather than \b keep "c++" matchable,
            # since "+" is not a word character.
            plural = "s?" if len(word) > 3 else ""
            pattern = rf"(?<!\w){re.escape(word)}{plural}(?!\w)"
            if re.search(pattern, text_lower):
                hits += 1
        return hits

    tech_matches = count_present(technical_keywords)
    non_tech_matches = count_present(non_technical_keywords)

    if tech_matches > non_tech_matches:
        return "Technical"
    if non_tech_matches > tech_matches:
        return "Non-Technical"
    return "Unknown"