# cv-parser-api / parser_core.py
import spacy
from huggingface_hub import snapshot_download
import pdfplumber
from docx import Document
import re
# Load the general-purpose spaCy model, used for GPE/LOC (location) entities.
# NOTE: import of this module fails if "en_core_web_lg" is not installed.
nlp_general = spacy.load("en_core_web_lg")
# Download and load the skill-extractor model from Hugging Face.
# NOTE(review): snapshot_download may hit the network on first import (cached
# afterwards) — confirm this side effect is acceptable at import time.
model_path = snapshot_download("amjad-awad/skill-extractor", repo_type="model")
nlp_skills = spacy.load(model_path) # Dedicated model for skills
def read_pdf(path):
    """Extract the plain text of every page in a PDF, joined by newlines.

    Pages with no extractable text contribute an empty string.
    """
    with pdfplumber.open(path) as pdf:
        pages = [page.extract_text() or "" for page in pdf.pages]
    return "\n".join(pages)
def read_docx(path):
    """Return the text of all paragraphs in a .docx file, newline-separated."""
    document = Document(path)
    return "\n".join(paragraph.text for paragraph in document.paragraphs)
def read_file(path):
    """Dispatch to the correct text reader based on the file extension.

    Parameters
    ----------
    path : str
        Path to a .pdf or .docx file. The extension is matched
        case-insensitively, so "CV.PDF" and "cv.Docx" are accepted.

    Returns
    -------
    str
        The extracted plain text.

    Raises
    ------
    ValueError
        If the extension is neither .pdf nor .docx.
    """
    # Lower-case once so the extension check is case-insensitive
    # (the original rejected upper-case extensions like ".PDF").
    lowered = path.lower()
    if lowered.endswith(".pdf"):
        return read_pdf(path)
    if lowered.endswith(".docx"):
        return read_docx(path)
    raise ValueError("Unsupported file type.")
def extract_location(text):
    """Return unique place names (GPE/LOC entities) detected in *text*."""
    found = set()
    for ent in nlp_general(text).ents:
        # Only geographic entities count as locations.
        if ent.label_ in ("GPE", "LOC"):
            found.add(ent.text)
    return list(found)
def extract_experience(text):
    """Extract lines from *text* that look like work-experience entries.

    A line qualifies when, after stripping leading bullet/numbering
    characters, it contains one of the job-related keywords below
    (case-insensitive substring match), is longer than 5 characters
    once whitespace is normalized, and contains at least one letter.

    Parameters
    ----------
    text : str
        Raw CV text, one statement per line.

    Returns
    -------
    list[str]
        Deduplicated matching lines, in first-seen order.
    """
    # Keywords to identify actual experience lines. Substring matching is
    # deliberately broad ("lead" also matches "leader") for recall.
    experience_keywords = [
        "experience", "intern", "trainee", "developer", "engineer", "project",
        "job", "specialist", "analyst", "manager", "consultant", "architect",
        "scientist", "coordinator", "assistant", "lead", "head", "director",
        "associate", "fellow", "program", "role", "position", "work", "co-op", "researcher", "officer"
    ]
    # Hoisted out of the loop: compile the patterns once per call instead of
    # re-parsing them for every line.
    leading_junk = re.compile(r'^[\s\u2022\-\d\*\-–—\.]+\s*')  # bullets, dashes, numbering
    multi_space = re.compile(r'\s+')
    exp_lines = []
    for line in text.split("\n"):
        original_line = line.strip()
        if not original_line:
            continue
        # Remove common bullet points and other leading non-alphanumeric chars.
        processed_line = leading_junk.sub('', original_line)
        # Lowercase copy for the case-insensitive keyword check.
        lower_processed_line = processed_line.lower()
        if any(key in lower_processed_line for key in experience_keywords):
            # Basic cleaning: collapse runs of whitespace.
            cleaned_line = multi_space.sub(' ', processed_line).strip()
            # Filter out lines that are too short or contain no letters.
            if len(cleaned_line) > 5 and any(c.isalpha() for c in cleaned_line):
                exp_lines.append(cleaned_line)
    # dict.fromkeys deduplicates while preserving first-seen order; the
    # previous list(set(...)) produced a run-to-run nondeterministic order
    # despite the comment promising "consistent output".
    return list(dict.fromkeys(exp_lines))
def extract_skills(text):
    """Run the dedicated skills model over *text* and return unique skills."""
    doc = nlp_skills(text)
    # Entities whose label contains "SKILLS" are the model's skill spans.
    raw = [ent.text for ent in doc.ents if "SKILLS" in ent.label_]
    # Keep entries longer than 2 chars (length checked before stripping,
    # matching the original behavior), strip, and deduplicate.
    unique = {entry.strip() for entry in raw if len(entry) > 2}
    return list(unique)
def parse_cv(path):
    """Parse a CV file (.pdf or .docx) into skills/experience/location fields."""
    raw_text = read_file(path)
    parsed = {}
    parsed["skills"] = extract_skills(raw_text)
    parsed["experience"] = extract_experience(raw_text)
    parsed["location"] = extract_location(raw_text)
    return parsed