Spaces:

Harshilforworks
/

Rs-backend-ml

Sleeping

App Files Files Community

Rs-backend-ml / src /extract_tech.py

Harshilforworks

Upload 35 files

6518a94 verified 3 months ago

raw

history blame contribute delete

6.1 kB

	"""Extract tech stacks from resumes (placed in backend/src).

	This script can parse a PDF using the hybrid parser (`Parse_resume.py`) located
	in the same `src` folder or read a pre-parsed text file and extract tech stack.

	Run as:
	python backend/src/extract_tech.py --pdf path/to/resume.pdf

	"""

	from __future__ import annotations
	import argparse
	import json
	import re
	from pathlib import Path
	from typing import List


	# Removed TECH_KEYWORDS whitelist per user request.
	# We now use lightweight heuristics (and a small stopword filter) to accept tokens
	# instead of relying on an explicit whitelist.
	STOPWORDS = {'and', 'or', 'with', 'the', 'a', 'an', 'in', 'on', 'for', 'to', 'of', 'by', 'from', 'at'}

	try:
	# prefer spaCy stop words (fast, comprehensive)
	from spacy.lang.en.stop_words import STOP_WORDS as _SPACY_STOPWORDS
	STOPWORDS = set(_SPACY_STOPWORDS)
	except Exception:
	try:
	import nltk
	from nltk.corpus import stopwords as _nltk_stopwords
	try:
	# attempt to use already-installed stopwords
	STOPWORDS = set(_nltk_stopwords.words('english'))
	except LookupError:
	# download corpus on demand (will require network)
	nltk.download('stopwords')
	STOPWORDS = set(_nltk_stopwords.words('english'))
	except Exception:
	# minimal fallback
	STOPWORDS = {'and', 'or', 'with', 'the', 'a', 'an', 'in', 'on', 'for', 'to', 'of', 'by', 'from', 'at'}

	def _token_clean(tok: str) -> str:
	t = tok.strip()
	t = re.sub(r"^[^A-Za-z0-9#+./-]+\|[^A-Za-z0-9#+./-]+$", '', t)
	return t


	def extract_skills_from_text(text: str) -> List[str]:
	if not text:
	return []

	lines = [l.strip() for l in text.splitlines()]

	# Detect both Tech Stack lines and Skills/Technical Skills headings
	heading_re = re.compile(r'^(skills\|technical skills\|tech\sstack\|techstack\|technology\sstack\|technologies\|tools\|skillset\|technical competencies)[:\s-]*$', re.I)
	inline_heading_re = re.compile(r'^(skills\|technical skills\|tech\sstack\|techstack\|technology\sstack\|technologies\|tools\|skillset)[:\s-]+(.+)$', re.I)
	# explicit Tech Stack line detection (e.g. "Tech Stack: Python, AWS, Docker")
	techstack_line_re = re.compile(r'\btech\sstack\b\s[:：]\s*(.+)$', re.I)

	candidates = []
	i = 0
	while i < len(lines):
	line = lines[i]
	if not line:
	i += 1
	continue

	# explicit inline headings like "Tech Stack: X, Y"
	m_inline = inline_heading_re.match(line)
	if m_inline:
	# If it's a tech stack inline heading the capture may already contain list
	candidates.append(m_inline.group(2).strip())
	i += 1
	continue

	# catch explicit 'Tech Stack: ...' anywhere in the line
	m_tech = techstack_line_re.search(line)
	if m_tech:
	candidates.append(m_tech.group(1).strip())
	i += 1
	continue

	if heading_re.match(line) or heading_re.match(line.lower()):
	j = i + 1
	buf = []
	while j < len(lines) and lines[j].strip():
	if re.match(r'^[A-Z][A-Za-z ]{1,40}$', lines[j]) and len(lines[j].split()) <= 4:
	break
	buf.append(lines[j])
	j += 1
	if buf:
	candidates.append(' '.join(buf))
	i = j
	continue

	i += 1

	# We collect both explicit 'Tech Stack' lines and Skills/Technical Skills
	# sections. If both appear they'll both contribute candidate strings.

	seen = set()
	result = []
	split_re = re.compile(r'[,/;\|\u2022]+')


	for cand in candidates:
	parts = split_re.split(cand)
	for p in parts:
	tok = _token_clean(p)
	if not tok:
	continue
	key = tok.lower()

	# ignore trivial stopwords
	if key in STOPWORDS:
	continue

	# Heuristics-only acceptance:
	# - must contain at least one letter
	# - and either contain punctuation/digit (e.g. C++, .NET), or be longer than 1 char
	accept = False
	if re.search(r'[A-Za-z]', tok):
	if re.search(r'[+.#-]', tok) or re.search(r'\d', tok) or len(tok) > 1:
	accept = True

	if accept and key not in seen:
	seen.add(key)
	result.append(tok)

	return result


	def extract_from_pdf(pdf_path: str) -> List[str]:
	# Import parser from same src folder
	try:
	from src.Parse_resume import parse_document_hybrid
	except Exception as e:
	raise RuntimeError(f"Could not import Parse_resume.parse_document_hybrid: {e}")

	res = parse_document_hybrid(pdf_path, save_parsed_text=False)
	text = res.get('content', '')
	skills = extract_skills_from_text(text)
	return skills


	def main(argv=None):
	p = argparse.ArgumentParser(description='Extract tech stacks from resumes')
	p.add_argument('--pdf', help='Path to PDF resume to parse')
	p.add_argument('--text', help='Path to pre-parsed text file to read')
	p.add_argument('--only-techstack', action='store_true', help='Only extract tokens from lines that mention "Tech Stack"')
	p.add_argument('--out', help='Path to save JSON output')
	args = p.parse_args(argv)

	if not args.pdf and not args.text:
	print('Provide either --pdf or --text')
	return

	if args.pdf:
	skills = extract_from_pdf(args.pdf)
	else:
	txt = Path(args.text).read_text(encoding='utf-8')
	skills = extract_skills_from_text(txt)

	output = {'skills': skills}
	print(json.dumps(output, ensure_ascii=False, indent=2))
	if args.out:
	Path(args.out).write_text(json.dumps(output, ensure_ascii=False, indent=2), encoding='utf-8')


	if __name__ == '__main__':
	main()