Spaces:

nadinekhaled500
/

cv-parser-api

Sleeping

File size: 3,036 Bytes

646eb9d

import spacy
from huggingface_hub import snapshot_download
import pdfplumber
from docx import Document
import re

# Load general spaCy model for locations
nlp_general = spacy.load("en_core_web_lg")

# Download and load the skill-extractor model from Hugging Face
model_path = snapshot_download("amjad-awad/skill-extractor", repo_type="model")
nlp_skills = spacy.load(model_path)  # Dedicated model for skills

def read_pdf(path):
    text = []
    with pdfplumber.open(path) as pdf:
        for p in pdf.pages:
            text.append(p.extract_text() or "")
    return "\n".join(text)

def read_docx(path):
    doc = Document(path)
    return "\n".join([p.text for p in doc.paragraphs])

def read_file(path):
    if path.endswith(".pdf"):
        return read_pdf(path)
    elif path.endswith(".docx"):
        return read_docx(path)
    else:
        raise ValueError("Unsupported file type.")

def extract_location(text):
    doc = nlp_general(text)
    locs = [ent.text for ent in doc.ents if ent.label_ in ("GPE", "LOC")]
    return list(set(locs))

def extract_experience(text):
    exp_lines = []

    # Keywords to identify actual experience lines
    experience_keywords = [
        "experience", "intern", "trainee", "developer", "engineer", "project",
        "job", "specialist", "analyst", "manager", "consultant", "architect",
        "scientist", "coordinator", "assistant", "lead", "head", "director",
        "associate", "fellow", "program", "role", "position", "work", "co-op", "researcher", "officer"
    ]

    for line in text.split("\n"):
        original_line = line.strip()
        if not original_line:
            continue

        # Remove common bullet points and other leading non-alphanumeric chars
        processed_line = re.sub(r'^[\s\u2022\-\d\*\-–—\.]+\s*', '', original_line)

        # Convert to lowercase for case-insensitive keyword checking
        lower_processed_line = processed_line.lower()

        # Check if any experience keyword is present in the line
        if any(key in lower_processed_line for key in experience_keywords):
            # Basic cleaning: Normalize spaces
            cleaned_line = re.sub(r'\s+', ' ', processed_line).strip()
            # Further filter out lines that are too short or just numbers
            if len(cleaned_line) > 5 and any(c.isalpha() for c in cleaned_line):
                exp_lines.append(cleaned_line)

    # Use set to deduplicate, then convert back to list for consistent output
    return list(set(exp_lines))

def extract_skills(text):
    doc = nlp_skills(text)  # Use the dedicated skills model
    skills = [ent.text for ent in doc.ents if "SKILLS" in ent.label_]  # Extract SKILLS entities
    # Clean up: Deduplicate and filter short/irrelevant
    skills = list(set([s.strip() for s in skills if len(s) > 2]))
    return skills

def parse_cv(path):
    text = read_file(path)
    return {
        "skills": extract_skills(text),
        "experience": extract_experience(text),
        "location": extract_location(text)
    }