Spaces:

nehasingh1525
/

Resume-Screening-System

Sleeping

Resume-Screening-System / resume_parser.py

Neha Singh

resume-screening-system

263eb11 about 1 month ago

5.37 kB

	"""
	resume_parser.py
	----------------
	Handles extracting text from PDF and DOCX resume files,
	extracting candidate name, phone, email, and
	cleaning the raw text for further processing.
	"""

	import re
	import os
	from PyPDF2 import PdfReader
	from docx import Document


	def extract_text_from_pdf(filepath):
		"""
		Read a PDF and return the concatenated text of its pages.

		Pages that yield no text are skipped; each page that does yield text
		contributes that text followed by a newline. Reading is best-effort:
		any exception is reported on stdout and whatever text was collected
		before the failure is returned.

		Args:
			filepath (str): Path to the PDF file.

		Returns:
			str: Text gathered from the pages, or an empty string if nothing
				could be read.
		"""
		chunks = []
		try:
			# Keep the whole loop inside the try so a failure on a later page
			# still returns the text of the pages already processed.
			for page in PdfReader(filepath).pages:
				content = page.extract_text()
				if content:
					chunks.append(content + "\n")
		except Exception as e:
			print(f"[ERROR] Failed to read PDF: {e}")
		return "".join(chunks)


	def extract_text_from_docx(filepath):
		"""
		Extract all text from a DOCX file, including tables.

		Paragraph text is emitted first, one paragraph per line. Each table row
		is then emitted as one line, with its non-blank cell texts stripped and
		joined by " | ". Rows whose cells are all blank are skipped. Reading is
		best-effort: any exception is reported on stdout and the text collected
		so far is returned.

		Args:
			filepath (str): Path to the DOCX file.

		Returns:
			str: Extracted text from all paragraphs and tables, or an empty
				string if the document could not be read.
		"""
		chunks = []
		try:
			doc = Document(filepath)

			# Body paragraphs, in document order.
			for para in doc.paragraphs:
				chunks.append(para.text + "\n")

			# Tables (e.g. skills laid out in tabular form).
			for table in doc.tables:
				for row in table.rows:
					cells = [cell.text.strip() for cell in row.cells]
					# Plain " | " separator: the previous " \| " literal was an
					# invalid escape that leaked a backslash into the output.
					row_text = " | ".join(cell for cell in cells if cell)
					if row_text:
						chunks.append(row_text + "\n")
		except Exception as e:
			print(f"[ERROR] Failed to read DOCX: {e}")
		return "".join(chunks)


	def extract_text(filepath):
		"""
		Pick the right extractor for a resume file based on its extension.

		The extension is compared case-insensitively, so ".PDF" and ".pdf" are
		treated the same.

		Args:
			filepath (str): Path to a PDF or DOCX file.

		Returns:
			str: Raw text produced by the matching extractor.

		Raises:
			ValueError: If the extension is neither ".pdf" nor ".docx".
		"""
		_, ext = os.path.splitext(filepath)
		ext = ext.lower()

		# Reject unknown formats up front, then dispatch.
		if ext not in (".pdf", ".docx"):
			raise ValueError(f"Unsupported file format: {ext}. Use PDF or DOCX.")

		if ext == ".pdf":
			return extract_text_from_pdf(filepath)
		return extract_text_from_docx(filepath)


	def extract_email(raw_text):
		"""
		Find the first email address in resume text.

		Args:
			raw_text (str): The raw extracted text.

		Returns:
			str: The first email address in reading order, or an empty string
				when the text contains none.
		"""
		found = re.search(r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}', raw_text)
		if found is None:
			return ""
		return found.group(0)


	def extract_phone(raw_text):
		"""
		Extract a phone number from resume text.

		Three patterns are tried in order: Indian (+91), US (+1), then a
		generic international form. The first pattern that matches anywhere in
		the text decides the result; later patterns are not consulted. Among
		that pattern's matches the longest one is returned, on the assumption
		that it is the most complete number.

		Args:
			raw_text (str): The raw extracted text.

		Returns:
			str: The selected phone number with surrounding whitespace removed,
				or an empty string if no pattern matches.
		"""
		patterns = [
			r'(?:\+91[\s-]?)?[6-9]\d{4}[\s-]?\d{5}',  # Indian: +91 98765 43210
			# The area-code parentheses are optional literals and must be
			# escaped. An unescaped "$?" is a quantified anchor, which the re
			# module rejects with "nothing to repeat".
			r'(?:\+1[\s-]?)?\(?\d{3}\)?[\s.-]?\d{3}[\s.-]?\d{4}',  # US: (555) 123-4567
			r'\+?\d{1,3}[\s.-]?\d{3,4}[\s.-]?\d{3,4}[\s.-]?\d{0,4}',  # International
		]
		for pattern in patterns:
			phones = re.findall(pattern, raw_text)
			if phones:
				# Return the longest match (most likely a real phone number)
				return max(phones, key=len).strip()
		return ""


	def extract_candidate_name(raw_text):
		"""
		Guess the candidate's name from the top of the resume.

		Only the first five lines of the stripped text are examined. A line is
		accepted as the name when it is non-empty, contains no email address
		or phone-like digit run, is not a bare section header such as "resume"
		or "summary", is shorter than 60 characters, and consists solely of
		ASCII letters, whitespace, dots and hyphens.

		Args:
			raw_text (str): The raw extracted text.

		Returns:
			str: The first qualifying line in title case, or an empty string
				if none of the inspected lines qualifies.
		"""
		email_re = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'
		phone_re = r'[\+]?\d[\d\s\-]{7,}'
		name_re = r'^[A-Za-z\s\.\-]+$'
		headers = ["resume", "curriculum vitae", "cv", "objective", "summary", "profile"]

		top_lines = raw_text.strip().split("\n")[:5]
		for candidate in (row.strip() for row in top_lines):
			if not candidate:
				continue
			# Contact details are never the name.
			if re.search(email_re, candidate) or re.search(phone_re, candidate):
				continue
			# Neither are standalone section headings.
			if candidate.lower().strip() in headers:
				continue
			# A short, letters-only line is taken to be the name.
			if len(candidate) < 60 and re.match(name_re, candidate):
				return candidate.title()
		return ""


	def clean_text(raw_text):
		"""
		Clean and normalize extracted text.

		Steps:
			1. Convert to lowercase
			2. Remove URLs (tokens starting with "http" or "www.")
			3. Remove email addresses
			4. Replace special characters with a space (keep letters, numbers,
			   whitespace, and +, #, -, ., /)
			5. Collapse runs of whitespace into a single space
			6. Strip leading/trailing whitespace

		Args:
			raw_text (str): The raw extracted text.

		Returns:
			str: Cleaned text ready for NLP processing.
		"""
		text = raw_text.lower()

		# Remove URLs. The "|" must be unescaped to act as alternation; written
		# as "\|" it is a literal pipe and the pattern never matches a URL.
		text = re.sub(r"http\S+|www\.\S+", "", text)

		# Remove email addresses
		text = re.sub(r"\S+@\S+\.\S+", "", text)

		# Replace everything except letters, numbers, whitespace and the
		# symbols + # - . / so that tokens such as "c++", "c#" and "ci/cd" survive.
		text = re.sub(r"[^a-z0-9\s\+\#\-\.\/]", " ", text)

		# Collapse multiple spaces (and any other whitespace) into one space
		text = re.sub(r"\s+", " ", text)

		return text.strip()