Spaces:
Sleeping
Sleeping
"""Extract tech stacks from resumes (placed in backend/src).

This script either parses a PDF with the hybrid parser (`Parse_resume.py`),
located in the same `src` folder, or reads a pre-parsed text file, and then
extracts the tech stack from the resulting text.

Run as:
    python backend/src/extract_tech.py --pdf path/to/resume.pdf
    python backend/src/extract_tech.py --text path/to/resume.txt --out skills.json
"""
| from __future__ import annotations | |
| import argparse | |
| import json | |
| import re | |
| from pathlib import Path | |
| from typing import List | |
# There is no technology whitelist: tokens are accepted by lightweight
# heuristics, and STOPWORDS only filters out common English filler words.
# NOTE(review): general-purpose stopword lists may contain words that are also
# technology names — confirm that dropping them is acceptable.
_MINIMAL_STOPWORDS = ('and', 'or', 'with', 'the', 'a', 'an', 'in', 'on', 'for', 'to', 'of', 'by', 'from', 'at')


def _load_stopwords():
    """Return the stopword set from the best source that can be loaded.

    Sources are tried in order: spaCy's English stop words, then NLTK's
    English stopword corpus (downloaded on demand, which needs network
    access), then a small built-in list when neither is usable.
    """
    try:
        # Preferred source: spaCy's stop words.
        from spacy.lang.en.stop_words import STOP_WORDS as spacy_words
        return set(spacy_words)
    except Exception:
        try:
            import nltk
            from nltk.corpus import stopwords as nltk_corpus
            try:
                # Use the corpus if it is already installed locally.
                return set(nltk_corpus.words('english'))
            except LookupError:
                # Corpus missing: fetch it once, then read it again.
                nltk.download('stopwords')
                return set(nltk_corpus.words('english'))
        except Exception:
            # Neither library is usable; fall back to the minimal list.
            return set(_MINIMAL_STOPWORDS)


STOPWORDS = _load_stopwords()
| def _token_clean(tok: str) -> str: | |
| t = tok.strip() | |
| t = re.sub(r"^[^A-Za-z0-9#+./-]+|[^A-Za-z0-9#+./-]+$", '', t) | |
| return t | |
# Heading words shared by the standalone and inline heading patterns so the
# two stay in sync.
_HEADING_WORDS = (
    r'skills|technical skills|tech\s*stack|techstack|technology\s*stack'
    r'|technologies|tools|skillset|technical competencies'
)
# A heading alone on its line, e.g. "Technical Skills:".
_HEADING_RE = re.compile(rf'^({_HEADING_WORDS})[:\s-]*$', re.I)
# A heading followed by its list on the same line, e.g. "Skills: Python, AWS".
_INLINE_HEADING_RE = re.compile(rf'^({_HEADING_WORDS})[:\s-]+(.+)$', re.I)
# "Tech Stack: ..." appearing anywhere in a line.
_TECHSTACK_LINE_RE = re.compile(r'\btech\s*stack\b\s*:\s*(.+)$', re.I)
# A short capitalised, letters-only line is taken to be the next section title.
_NEXT_HEADING_RE = re.compile(r'^[A-Z][A-Za-z ]{1,40}$')
# List markers at the start of a line: - * bullet, middle dot, en/em dash.
_BULLET_RE = re.compile(r'^[-*\u2022\u00b7\u2013\u2014]+\s+')
# Separators between items inside one candidate string.
_SPLIT_RE = re.compile(r'[,/;|\u2022]+')


def _section_fragments(body_lines):
    """Group the lines found under a heading into candidate strings.

    A line that starts with a list marker opens a new fragment (with the
    marker removed); any other line is treated as a continuation and appended
    to the current fragment with a single space.
    """
    fragments = []
    for raw in body_lines:
        item = _BULLET_RE.sub('', raw)
        if item != raw or not fragments:
            fragments.append(item)
        else:
            fragments[-1] = f'{fragments[-1]} {item}'
    return fragments


def extract_skills_from_text(text: str) -> List[str]:
    """Extract technology/skill tokens from resume text.

    Three kinds of lines contribute candidate strings:

    * inline headings such as ``Skills: Python, AWS``;
    * any line containing ``Tech Stack: ...``;
    * standalone headings (``Skills``, ``Technologies`` ...), whose following
      lines are collected until a blank line or a line that looks like the
      next section title (capitalised, letters and spaces only, at most four
      words).

    Candidates are split on ``, / ; |`` and bullet characters, cleaned with
    ``_token_clean`` and filtered: stopwords are dropped, and a token must
    contain an ASCII letter and be longer than one character (so one-letter
    names such as ``C`` or ``R`` are not returned).

    Args:
        text: Raw resume text; may be empty or ``None``.

    Returns:
        Accepted tokens in order of first appearance, de-duplicated
        case-insensitively. Empty list when nothing is found.
    """
    if not text:
        return []

    lines = [ln.strip() for ln in text.splitlines()]
    candidates = []
    i = 0
    while i < len(lines):
        line = lines[i]
        if not line:
            i += 1
            continue

        inline = _INLINE_HEADING_RE.match(line)
        if inline:
            candidates.append(inline.group(2).strip())
            i += 1
            continue

        tech = _TECHSTACK_LINE_RE.search(line)
        if tech:
            candidates.append(tech.group(1).strip())
            i += 1
            continue

        if not _HEADING_RE.match(line):
            i += 1
            continue

        # Standalone heading: gather the lines that belong to the section.
        j = i + 1
        body = []
        while j < len(lines) and lines[j]:
            if _NEXT_HEADING_RE.match(lines[j]) and len(lines[j].split()) <= 4:
                break
            body.append(lines[j])
            j += 1
        # Bulleted lines become separate fragments instead of being glued
        # together into one bogus token such as "- Python - Docker".
        candidates.extend(_section_fragments(body))
        # Resume at the line that ended the section so it is examined itself
        # (it may be another heading).
        i = j

    seen = set()
    result = []
    for cand in candidates:
        for part in _SPLIT_RE.split(cand):
            tok = _token_clean(part)
            if not tok:
                continue
            key = tok.lower()
            if key in STOPWORDS or key in seen:
                continue
            # A one-character token containing a letter cannot also contain
            # punctuation or a digit, so "has a letter and length > 1" is the
            # whole acceptance rule.
            if len(tok) > 1 and re.search(r'[A-Za-z]', tok):
                seen.add(key)
                result.append(tok)
    return result
def extract_from_pdf(pdf_path: str) -> List[str]:
    """Parse a PDF resume and return the skills found in its text.

    The hybrid parser is imported lazily. It is looked up first as
    ``src.Parse_resume`` (package-style layout) and, if that module cannot be
    found, as plain ``Parse_resume`` so the file also works when executed
    directly from the folder that contains both modules.

    Args:
        pdf_path: Path to the PDF file handed to ``parse_document_hybrid``.

    Returns:
        The list produced by ``extract_skills_from_text`` for the parsed text.

    Raises:
        RuntimeError: If ``parse_document_hybrid`` cannot be imported; the
            underlying import error is attached as ``__cause__``.
    """
    try:
        try:
            from src.Parse_resume import parse_document_hybrid
        except ModuleNotFoundError as missing:
            # Only fall back when the parser module itself is what is missing;
            # a missing dependency of the parser must surface unchanged.
            if missing.name not in ('src', 'src.Parse_resume'):
                raise
            from Parse_resume import parse_document_hybrid
    except Exception as e:
        raise RuntimeError(f"Could not import Parse_resume.parse_document_hybrid: {e}") from e

    # assumes the parser returns a mapping with the text under 'content' —
    # TODO confirm against Parse_resume
    res = parse_document_hybrid(pdf_path, save_parsed_text=False)
    text = res.get('content', '')
    return extract_skills_from_text(text)
def main(argv=None):
    """Command-line entry point: extract skills and emit them as JSON.

    Reads a resume from ``--pdf`` (takes precedence) or ``--text``, prints
    ``{"skills": [...]}`` to stdout and, when ``--out`` is given, writes the
    same JSON to that path. With neither input option a hint is printed and
    the function returns without doing anything else.

    Args:
        argv: Argument list to parse; ``None`` lets argparse use ``sys.argv``.
    """
    parser = argparse.ArgumentParser(description='Extract tech stacks from resumes')
    parser.add_argument('--pdf', help='Path to PDF resume to parse')
    parser.add_argument('--text', help='Path to pre-parsed text file to read')
    # NOTE(review): this flag is parsed but its value is never read below.
    parser.add_argument('--only-techstack', action='store_true', help='Only extract tokens from lines that mention "Tech Stack"')
    parser.add_argument('--out', help='Path to save JSON output')
    options = parser.parse_args(argv)

    if not (options.pdf or options.text):
        print('Provide either --pdf or --text')
        return

    if options.pdf:
        skills = extract_from_pdf(options.pdf)
    else:
        raw_text = Path(options.text).read_text(encoding='utf-8')
        skills = extract_skills_from_text(raw_text)

    # Serialise once; the same text goes to stdout and to the optional file.
    rendered = json.dumps({'skills': skills}, ensure_ascii=False, indent=2)
    print(rendered)
    if options.out:
        Path(options.out).write_text(rendered, encoding='utf-8')


if __name__ == '__main__':
    main()