# Rs_mini_projrct / src / extract_tech.py
# (Hugging Face upload header: Harshilforworks — "Upload 14 files", commit e681f27 verified)
"""Extract tech stacks from resumes (placed in backend/src).
This script can parse a PDF using the hybrid parser (`Parse_resume.py`) located
in the same `src` folder or read a pre-parsed text file and extract tech stack.
Run as:
python backend/src/extract_tech.py --pdf path/to/resume.pdf
"""
from __future__ import annotations
import argparse
import json
import re
from pathlib import Path
from typing import List
# The TECH_KEYWORDS whitelist was removed per user request: tokens are now
# accepted via lightweight heuristics plus a small stopword filter instead
# of an explicit allow-list.
_FALLBACK_STOPWORDS = frozenset(
    {'and', 'or', 'with', 'the', 'a', 'an', 'in', 'on', 'for', 'to', 'of', 'by', 'from', 'at'}
)
STOPWORDS = set(_FALLBACK_STOPWORDS)
try:
    # Preferred: spaCy's stop-word list (fast, comprehensive).
    from spacy.lang.en.stop_words import STOP_WORDS as _SPACY_STOPWORDS
    STOPWORDS = set(_SPACY_STOPWORDS)
except Exception:
    try:
        import nltk
        from nltk.corpus import stopwords as _nltk_stopwords
        try:
            # Use an already-installed NLTK stopword corpus if present.
            STOPWORDS = set(_nltk_stopwords.words('english'))
        except LookupError:
            # Fetch the corpus on demand (requires network access).
            nltk.download('stopwords')
            STOPWORDS = set(_nltk_stopwords.words('english'))
    except Exception:
        # Minimal fallback: the tiny built-in list above.
        STOPWORDS = set(_FALLBACK_STOPWORDS)
def _token_clean(tok: str) -> str:
t = tok.strip()
t = re.sub(r"^[^A-Za-z0-9#+./-]+|[^A-Za-z0-9#+./-]+$", '', t)
return t
def extract_skills_from_text(text: str) -> List[str]:
    """Extract candidate tech-stack tokens from raw resume text.

    Collects candidate strings from (a) inline headings like
    "Skills: X, Y", (b) explicit "Tech Stack: ..." fragments anywhere in a
    line, and (c) bare section headings followed by a list block. Candidates
    are split on common delimiters and filtered by a lightweight heuristic
    (must contain a letter, and either punctuation/digit — e.g. C++, .NET —
    or more than one character) plus the module-level STOPWORDS set.

    :param text: raw resume text (may be empty or falsy).
    :return: de-duplicated (case-insensitive) token list in first-seen order.
    """
    if not text:
        return []
    lines = [l.strip() for l in text.splitlines()]
    # Detect both Tech Stack lines and Skills/Technical Skills headings.
    heading_re = re.compile(r'^(skills|technical skills|tech\s*stack|techstack|technology\s*stack|technologies|tools|skillset|technical competencies)[:\s-]*$', re.I)
    inline_heading_re = re.compile(r'^(skills|technical skills|tech\s*stack|techstack|technology\s*stack|technologies|tools|skillset)[:\s-]+(.+)$', re.I)
    # Explicit Tech Stack line detection (e.g. "Tech Stack: Python, AWS, Docker").
    # NOTE(review): the class [::] reduces to a single ':' — possibly a mangled
    # full-width colon '：'; preserved as-is pending confirmation.
    techstack_line_re = re.compile(r'\btech\s*stack\b\s*[::]\s*(.+)$', re.I)
    candidates = []
    i = 0
    while i < len(lines):
        line = lines[i]
        if not line:
            i += 1
            continue
        # Inline headings like "Tech Stack: X, Y" — the capture is the list itself.
        m_inline = inline_heading_re.match(line)
        if m_inline:
            candidates.append(m_inline.group(2).strip())
            i += 1
            continue
        # Catch an explicit 'Tech Stack: ...' anywhere in the line.
        m_tech = techstack_line_re.search(line)
        if m_tech:
            candidates.append(m_tech.group(1).strip())
            i += 1
            continue
        # Bare heading on its own line: gather the following non-blank lines.
        # (heading_re is compiled with re.I, so the former extra
        # heading_re.match(line.lower()) call was redundant and is removed.)
        if heading_re.match(line):
            j = i + 1
            buf = []
            while j < len(lines) and lines[j].strip():
                # Stop at a short Title-Case line — likely the next section heading.
                if re.match(r'^[A-Z][A-Za-z ]{1,40}$', lines[j]) and len(lines[j].split()) <= 4:
                    break
                buf.append(lines[j])
                j += 1
            if buf:
                candidates.append(' '.join(buf))
            i = j
            continue
        i += 1
    # Both explicit 'Tech Stack' lines and Skills/Technical Skills sections
    # contribute candidate strings; if both appear, both are used.
    seen = set()
    result = []
    split_re = re.compile(r'[,/;|\u2022]+')
    for cand in candidates:
        for part in split_re.split(cand):
            tok = _token_clean(part)
            if not tok:
                continue
            key = tok.lower()
            # Ignore trivial stopwords.
            if key in STOPWORDS:
                continue
            # Heuristics-only acceptance:
            # - must contain at least one letter
            # - and either contain punctuation/digit (e.g. C++, .NET) or be
            #   longer than one character
            accept = False
            if re.search(r'[A-Za-z]', tok):
                if re.search(r'[+.#-]', tok) or re.search(r'\d', tok) or len(tok) > 1:
                    accept = True
            if accept and key not in seen:
                seen.add(key)
                result.append(tok)
    return result
def extract_from_pdf(pdf_path: str) -> List[str]:
    """Parse a PDF resume and return its extracted tech-stack tokens.

    Tries three parsing strategies in order, each falling through to the
    next on failure:
      1. the hybrid parser loaded from Parse_resume.py next to this file
         (via importlib, so it is found regardless of the working directory),
      2. the simple fallback parser (``simple_pdf_parser``),
      3. a direct ``from Parse_resume import ...`` as a last resort.

    :param pdf_path: filesystem path of the PDF to parse.
    :raises RuntimeError: when all three parsing methods fail; the message
        includes the error from each attempt.
    """
    try:
        # Method 1: the advanced hybrid parser, loaded with importlib.
        import importlib.util
        from pathlib import Path
        current_dir = Path(__file__).resolve().parent
        parse_resume_path = current_dir / "Parse_resume.py"
        # Bug fix: previously a missing Parse_resume.py fell through to a
        # silent `return []` without ever trying the fallback parsers.
        # Raise instead so the except-chain below gets a chance to run.
        if not parse_resume_path.exists():
            raise FileNotFoundError(f"Parse_resume.py not found at {parse_resume_path}")
        spec = importlib.util.spec_from_file_location("Parse_resume_module", parse_resume_path)
        if not (spec and spec.loader):
            raise ImportError(f"Could not build an import spec for {parse_resume_path}")
        parse_resume_module = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(parse_resume_module)
        res = parse_resume_module.parse_document_hybrid(pdf_path, save_parsed_text=False)
        return extract_skills_from_text(res.get('content', ''))
    except Exception as hybrid_error:
        print(f"Hybrid parser failed ({hybrid_error}), trying fallback parser...")
        # Method 2: fallback to the simple parser.
        try:
            from simple_pdf_parser import fallback_parse_document
            res = fallback_parse_document(pdf_path)
            return extract_skills_from_text(res.get('content', ''))
        except Exception as fallback_error:
            # Method 3: last resort — a plain import of the hybrid parser.
            try:
                from Parse_resume import parse_document_hybrid
                res = parse_document_hybrid(pdf_path, save_parsed_text=False)
                return extract_skills_from_text(res.get('content', ''))
            except Exception as direct_error:
                raise RuntimeError(
                    f"All parsing methods failed. Hybrid: {hybrid_error}, "
                    f"Fallback: {fallback_error}, Direct: {direct_error}"
                ) from direct_error
def main(argv=None):
    """CLI entry point: extract skills from a PDF or a pre-parsed text file
    and print (optionally save) the result as JSON."""
    parser = argparse.ArgumentParser(description='Extract tech stacks from resumes')
    parser.add_argument('--pdf', help='Path to PDF resume to parse')
    parser.add_argument('--text', help='Path to pre-parsed text file to read')
    parser.add_argument('--only-techstack', action='store_true',
                        help='Only extract tokens from lines that mention "Tech Stack"')
    parser.add_argument('--out', help='Path to save JSON output')
    args = parser.parse_args(argv)

    # At least one input source is required.
    if not (args.pdf or args.text):
        print('Provide either --pdf or --text')
        return

    if args.pdf:
        skills = extract_from_pdf(args.pdf)
    else:
        skills = extract_skills_from_text(Path(args.text).read_text(encoding='utf-8'))

    output = {'skills': skills}
    serialized = json.dumps(output, ensure_ascii=False, indent=2)
    print(serialized)
    if args.out:
        Path(args.out).write_text(serialized, encoding='utf-8')


if __name__ == '__main__':
    main()