Rs-backend-ml / src /extract_tech.py
Harshilforworks's picture
Upload 35 files
6518a94 verified
"""Extract tech stacks from resumes (placed in backend/src).
This script can parse a PDF using the hybrid parser (`Parse_resume.py`) located
in the same `src` folder or read a pre-parsed text file and extract tech stack.
Run as:
python backend/src/extract_tech.py --pdf path/to/resume.pdf
"""
from __future__ import annotations
import argparse
import json
import re
from pathlib import Path
from typing import List
# Removed TECH_KEYWORDS whitelist per user request.
# We now use lightweight heuristics (and a small stopword filter) to accept tokens
# instead of relying on an explicit whitelist.
STOPWORDS = {'and', 'or', 'with', 'the', 'a', 'an', 'in', 'on', 'for', 'to', 'of', 'by', 'from', 'at'}
try:
# prefer spaCy stop words (fast, comprehensive)
from spacy.lang.en.stop_words import STOP_WORDS as _SPACY_STOPWORDS
STOPWORDS = set(_SPACY_STOPWORDS)
except Exception:
try:
import nltk
from nltk.corpus import stopwords as _nltk_stopwords
try:
# attempt to use already-installed stopwords
STOPWORDS = set(_nltk_stopwords.words('english'))
except LookupError:
# download corpus on demand (will require network)
nltk.download('stopwords')
STOPWORDS = set(_nltk_stopwords.words('english'))
except Exception:
# minimal fallback
STOPWORDS = {'and', 'or', 'with', 'the', 'a', 'an', 'in', 'on', 'for', 'to', 'of', 'by', 'from', 'at'}
def _token_clean(tok: str) -> str:
t = tok.strip()
t = re.sub(r"^[^A-Za-z0-9#+./-]+|[^A-Za-z0-9#+./-]+$", '', t)
return t
def extract_skills_from_text(text: str) -> List[str]:
if not text:
return []
lines = [l.strip() for l in text.splitlines()]
# Detect both Tech Stack lines and Skills/Technical Skills headings
heading_re = re.compile(r'^(skills|technical skills|tech\s*stack|techstack|technology\s*stack|technologies|tools|skillset|technical competencies)[:\s-]*$', re.I)
inline_heading_re = re.compile(r'^(skills|technical skills|tech\s*stack|techstack|technology\s*stack|technologies|tools|skillset)[:\s-]+(.+)$', re.I)
# explicit Tech Stack line detection (e.g. "Tech Stack: Python, AWS, Docker")
techstack_line_re = re.compile(r'\btech\s*stack\b\s*[::]\s*(.+)$', re.I)
candidates = []
i = 0
while i < len(lines):
line = lines[i]
if not line:
i += 1
continue
# explicit inline headings like "Tech Stack: X, Y"
m_inline = inline_heading_re.match(line)
if m_inline:
# If it's a tech stack inline heading the capture may already contain list
candidates.append(m_inline.group(2).strip())
i += 1
continue
# catch explicit 'Tech Stack: ...' anywhere in the line
m_tech = techstack_line_re.search(line)
if m_tech:
candidates.append(m_tech.group(1).strip())
i += 1
continue
if heading_re.match(line) or heading_re.match(line.lower()):
j = i + 1
buf = []
while j < len(lines) and lines[j].strip():
if re.match(r'^[A-Z][A-Za-z ]{1,40}$', lines[j]) and len(lines[j].split()) <= 4:
break
buf.append(lines[j])
j += 1
if buf:
candidates.append(' '.join(buf))
i = j
continue
i += 1
# We collect both explicit 'Tech Stack' lines and Skills/Technical Skills
# sections. If both appear they'll both contribute candidate strings.
seen = set()
result = []
split_re = re.compile(r'[,/;|\u2022]+')
for cand in candidates:
parts = split_re.split(cand)
for p in parts:
tok = _token_clean(p)
if not tok:
continue
key = tok.lower()
# ignore trivial stopwords
if key in STOPWORDS:
continue
# Heuristics-only acceptance:
# - must contain at least one letter
# - and either contain punctuation/digit (e.g. C++, .NET), or be longer than 1 char
accept = False
if re.search(r'[A-Za-z]', tok):
if re.search(r'[+.#-]', tok) or re.search(r'\d', tok) or len(tok) > 1:
accept = True
if accept and key not in seen:
seen.add(key)
result.append(tok)
return result
def extract_from_pdf(pdf_path: str) -> List[str]:
# Import parser from same src folder
try:
from src.Parse_resume import parse_document_hybrid
except Exception as e:
raise RuntimeError(f"Could not import Parse_resume.parse_document_hybrid: {e}")
res = parse_document_hybrid(pdf_path, save_parsed_text=False)
text = res.get('content', '')
skills = extract_skills_from_text(text)
return skills
def main(argv=None):
p = argparse.ArgumentParser(description='Extract tech stacks from resumes')
p.add_argument('--pdf', help='Path to PDF resume to parse')
p.add_argument('--text', help='Path to pre-parsed text file to read')
p.add_argument('--only-techstack', action='store_true', help='Only extract tokens from lines that mention "Tech Stack"')
p.add_argument('--out', help='Path to save JSON output')
args = p.parse_args(argv)
if not args.pdf and not args.text:
print('Provide either --pdf or --text')
return
if args.pdf:
skills = extract_from_pdf(args.pdf)
else:
txt = Path(args.text).read_text(encoding='utf-8')
skills = extract_skills_from_text(txt)
output = {'skills': skills}
print(json.dumps(output, ensure_ascii=False, indent=2))
if args.out:
Path(args.out).write_text(json.dumps(output, ensure_ascii=False, indent=2), encoding='utf-8')
if __name__ == '__main__':
main()