Spaces:

Harshilforworks
/

Rs-backend-ml

Sleeping

File size: 6,099 Bytes

6518a94

"""Extract tech stacks from resumes (placed in backend/src).



This script can parse a PDF using the hybrid parser (`Parse_resume.py`) located

in the same `src` folder or read a pre-parsed text file and extract tech stack.



Run as:

  python backend/src/extract_tech.py --pdf path/to/resume.pdf



"""

from __future__ import annotations
import argparse
import json
import re
from pathlib import Path
from typing import List


# Removed TECH_KEYWORDS whitelist per user request.
# We now use lightweight heuristics (and a small stopword filter) to accept tokens
# instead of relying on an explicit whitelist.
STOPWORDS = {'and', 'or', 'with', 'the', 'a', 'an', 'in', 'on', 'for', 'to', 'of', 'by', 'from', 'at'}

try:
    # prefer spaCy stop words (fast, comprehensive)
    from spacy.lang.en.stop_words import STOP_WORDS as _SPACY_STOPWORDS
    STOPWORDS = set(_SPACY_STOPWORDS)
except Exception:
    try:
        import nltk
        from nltk.corpus import stopwords as _nltk_stopwords
        try:
            # attempt to use already-installed stopwords
            STOPWORDS = set(_nltk_stopwords.words('english'))
        except LookupError:
            # download corpus on demand (will require network)
            nltk.download('stopwords')
            STOPWORDS = set(_nltk_stopwords.words('english'))
    except Exception:
        # minimal fallback
        STOPWORDS = {'and', 'or', 'with', 'the', 'a', 'an', 'in', 'on', 'for', 'to', 'of', 'by', 'from', 'at'}

def _token_clean(tok: str) -> str:
    t = tok.strip()
    t = re.sub(r"^[^A-Za-z0-9#+./-]+|[^A-Za-z0-9#+./-]+$", '', t)
    return t


def extract_skills_from_text(text: str) -> List[str]:
    if not text:
        return []

    lines = [l.strip() for l in text.splitlines()]

    # Detect both Tech Stack lines and Skills/Technical Skills headings
    heading_re = re.compile(r'^(skills|technical skills|tech\s*stack|techstack|technology\s*stack|technologies|tools|skillset|technical competencies)[:\s-]*$', re.I)
    inline_heading_re = re.compile(r'^(skills|technical skills|tech\s*stack|techstack|technology\s*stack|technologies|tools|skillset)[:\s-]+(.+)$', re.I)
    # explicit Tech Stack line detection (e.g. "Tech Stack: Python, AWS, Docker")
    techstack_line_re = re.compile(r'\btech\s*stack\b\s*[:：]\s*(.+)$', re.I)

    candidates = []
    i = 0
    while i < len(lines):
        line = lines[i]
        if not line:
            i += 1
            continue

        # explicit inline headings like "Tech Stack: X, Y"
        m_inline = inline_heading_re.match(line)
        if m_inline:
            # If it's a tech stack inline heading the capture may already contain list
            candidates.append(m_inline.group(2).strip())
            i += 1
            continue

        # catch explicit 'Tech Stack: ...' anywhere in the line
        m_tech = techstack_line_re.search(line)
        if m_tech:
            candidates.append(m_tech.group(1).strip())
            i += 1
            continue

        if heading_re.match(line) or heading_re.match(line.lower()):
            j = i + 1
            buf = []
            while j < len(lines) and lines[j].strip():
                if re.match(r'^[A-Z][A-Za-z ]{1,40}$', lines[j]) and len(lines[j].split()) <= 4:
                    break
                buf.append(lines[j])
                j += 1
            if buf:
                candidates.append(' '.join(buf))
            i = j
            continue

        i += 1

    # We collect both explicit 'Tech Stack' lines and Skills/Technical Skills
    # sections. If both appear they'll both contribute candidate strings.

    seen = set()
    result = []
    split_re = re.compile(r'[,/;|\u2022]+')


    for cand in candidates:
        parts = split_re.split(cand)
        for p in parts:
            tok = _token_clean(p)
            if not tok:
                continue
            key = tok.lower()

            # ignore trivial stopwords
            if key in STOPWORDS:
                continue

            # Heuristics-only acceptance:
            # - must contain at least one letter
            # - and either contain punctuation/digit (e.g. C++, .NET), or be longer than 1 char
            accept = False
            if re.search(r'[A-Za-z]', tok):
                if re.search(r'[+.#-]', tok) or re.search(r'\d', tok) or len(tok) > 1:
                    accept = True

            if accept and key not in seen:
                seen.add(key)
                result.append(tok)

    return result


def extract_from_pdf(pdf_path: str) -> List[str]:
    # Import parser from same src folder
    try:
        from src.Parse_resume import parse_document_hybrid
    except Exception as e:
        raise RuntimeError(f"Could not import Parse_resume.parse_document_hybrid: {e}")

    res = parse_document_hybrid(pdf_path, save_parsed_text=False)
    text = res.get('content', '')
    skills = extract_skills_from_text(text)
    return skills


def main(argv=None):
    p = argparse.ArgumentParser(description='Extract tech stacks from resumes')
    p.add_argument('--pdf', help='Path to PDF resume to parse')
    p.add_argument('--text', help='Path to pre-parsed text file to read')
    p.add_argument('--only-techstack', action='store_true', help='Only extract tokens from lines that mention "Tech Stack"')
    p.add_argument('--out', help='Path to save JSON output')
    args = p.parse_args(argv)

    if not args.pdf and not args.text:
        print('Provide either --pdf or --text')
        return

    if args.pdf:
        skills = extract_from_pdf(args.pdf)
    else:
        txt = Path(args.text).read_text(encoding='utf-8')
        skills = extract_skills_from_text(txt)

    output = {'skills': skills}
    print(json.dumps(output, ensure_ascii=False, indent=2))
    if args.out:
        Path(args.out).write_text(json.dumps(output, ensure_ascii=False, indent=2), encoding='utf-8')


if __name__ == '__main__':
    main()