# cv-parser-api / parser_core.py
import spacy
from huggingface_hub import snapshot_download
import pdfplumber
from docx import Document
import re
# Load the general-purpose spaCy model, used for GPE/LOC (location) entities.
# NOTE: import of this module fails if "en_core_web_lg" is not installed.
nlp_general = spacy.load("en_core_web_lg")
# Download and load the skill-extractor model from Hugging Face.
# NOTE(review): snapshot_download may hit the network on first import (cached
# afterwards) — confirm this side effect is acceptable at import time.
model_path = snapshot_download("amjad-awad/skill-extractor", repo_type="model")
nlp_skills = spacy.load(model_path) # Dedicated model for skills
def read_pdf(path):
    """Extract the plain text of every page in a PDF, joined by newlines.

    Pages with no extractable text contribute an empty string.
    """
    with pdfplumber.open(path) as pdf:
        pages = [page.extract_text() or "" for page in pdf.pages]
    return "\n".join(pages)
def read_docx(path):
    """Return the text of all paragraphs in a .docx file, newline-separated."""
    document = Document(path)
    return "\n".join(paragraph.text for paragraph in document.paragraphs)
def read_file(path):
    """Dispatch to the correct text reader based on the file extension.

    Parameters
    ----------
    path : str
        Path to a .pdf or .docx file. The extension is matched
        case-insensitively, so "CV.PDF" and "cv.Docx" are accepted.

    Returns
    -------
    str
        The extracted plain text.

    Raises
    ------
    ValueError
        If the extension is neither .pdf nor .docx.
    """
    # Lower-case once so the extension check is case-insensitive
    # (the original rejected upper-case extensions like ".PDF").
    lowered = path.lower()
    if lowered.endswith(".pdf"):
        return read_pdf(path)
    if lowered.endswith(".docx"):
        return read_docx(path)
    raise ValueError("Unsupported file type.")
def extract_location(text):
    """Return unique place names (GPE/LOC entities) detected in *text*."""
    found = set()
    for ent in nlp_general(text).ents:
        # Only geographic entities count as locations.
        if ent.label_ in ("GPE", "LOC"):
            found.add(ent.text)
    return list(found)
def extract_experience(text):
    """Extract lines from *text* that look like work-experience entries.

    A line qualifies when, after stripping leading bullet/numbering
    characters, it contains one of the job-related keywords below
    (case-insensitive substring match), is longer than 5 characters
    once whitespace is normalized, and contains at least one letter.

    Parameters
    ----------
    text : str
        Raw CV text, one statement per line.

    Returns
    -------
    list[str]
        Deduplicated matching lines, in first-seen order.
    """
    # Keywords to identify actual experience lines. Substring matching is
    # deliberately broad ("lead" also matches "leader") for recall.
    experience_keywords = [
        "experience", "intern", "trainee", "developer", "engineer", "project",
        "job", "specialist", "analyst", "manager", "consultant", "architect",
        "scientist", "coordinator", "assistant", "lead", "head", "director",
        "associate", "fellow", "program", "role", "position", "work", "co-op", "researcher", "officer"
    ]
    # Hoisted out of the loop: compile the patterns once per call instead of
    # re-parsing them for every line.
    leading_junk = re.compile(r'^[\s\u2022\-\d\*\-–—\.]+\s*')  # bullets, dashes, numbering
    multi_space = re.compile(r'\s+')
    exp_lines = []
    for line in text.split("\n"):
        original_line = line.strip()
        if not original_line:
            continue
        # Remove common bullet points and other leading non-alphanumeric chars.
        processed_line = leading_junk.sub('', original_line)
        # Lowercase copy for the case-insensitive keyword check.
        lower_processed_line = processed_line.lower()
        if any(key in lower_processed_line for key in experience_keywords):
            # Basic cleaning: collapse runs of whitespace.
            cleaned_line = multi_space.sub(' ', processed_line).strip()
            # Filter out lines that are too short or contain no letters.
            if len(cleaned_line) > 5 and any(c.isalpha() for c in cleaned_line):
                exp_lines.append(cleaned_line)
    # dict.fromkeys deduplicates while preserving first-seen order; the
    # previous list(set(...)) produced a run-to-run nondeterministic order
    # despite the comment promising "consistent output".
    return list(dict.fromkeys(exp_lines))
def extract_skills(text):
    """Run the dedicated skills model over *text* and return unique skills."""
    doc = nlp_skills(text)
    # Entities whose label contains "SKILLS" are the model's skill spans.
    raw = [ent.text for ent in doc.ents if "SKILLS" in ent.label_]
    # Keep entries longer than 2 chars (length checked before stripping,
    # matching the original behavior), strip, and deduplicate.
    unique = {entry.strip() for entry in raw if len(entry) > 2}
    return list(unique)
def parse_cv(path):
    """Parse a CV file (.pdf or .docx) into skills/experience/location fields."""
    raw_text = read_file(path)
    parsed = {}
    parsed["skills"] = extract_skills(raw_text)
    parsed["experience"] = extract_experience(raw_text)
    parsed["location"] = extract_location(raw_text)
    return parsed