Resume-Screening-System / resume_parser.py
Neha Singh
resume-screening-system
263eb11
"""
resume_parser.py
----------------
Handles extracting text from PDF and DOCX resume files,
extracting candidate name, phone, email, and
cleaning the raw text for further processing.
"""
import re
import os
from PyPDF2 import PdfReader
from docx import Document
def extract_text_from_pdf(filepath):
    """
    Read a PDF and return the text of its pages, one page per line block.

    Pages that yield no text are skipped. Any failure while opening or
    reading the file is reported on stdout and whatever text was gathered
    up to that point is returned (an empty string if nothing was read).

    Args:
        filepath (str): Path to the PDF file.

    Returns:
        str: Text of every readable page, each followed by a newline.
    """
    chunks = []
    try:
        for page in PdfReader(filepath).pages:
            content = page.extract_text()
            if content:
                chunks.append(content + "\n")
    except Exception as err:
        # Best-effort: report the problem and keep the pages already read.
        print(f"[ERROR] Failed to read PDF: {err}")
    return "".join(chunks)
def extract_text_from_docx(filepath):
    """
    Read a DOCX file and return its paragraph text followed by table text.

    Every paragraph contributes one line. Each table row contributes one
    line made of its non-empty cell texts joined with " | "; rows with no
    non-empty cells are skipped. Any failure is reported on stdout and the
    text gathered so far is returned.

    Args:
        filepath (str): Path to the DOCX file.

    Returns:
        str: Newline-terminated lines for paragraphs, then table rows.
    """
    pieces = []
    try:
        document = Document(filepath)

        # Body paragraphs first, in document order.
        for paragraph in document.paragraphs:
            pieces.append(paragraph.text + "\n")

        # Then tables (e.g. skills laid out in tabular form).
        for table in document.tables:
            for row in table.rows:
                cell_texts = [cell.text.strip() for cell in row.cells]
                joined = " | ".join(value for value in cell_texts if value)
                if joined:
                    pieces.append(joined + "\n")
    except Exception as err:
        # Best-effort: report the problem and keep what was already read.
        print(f"[ERROR] Failed to read DOCX: {err}")
    return "".join(pieces)
def extract_text(filepath):
    """
    Pick the extractor that matches the file extension and run it.

    The extension is compared case-insensitively.

    Args:
        filepath (str): Path to a PDF or DOCX file.

    Returns:
        str: Extracted raw text.

    Raises:
        ValueError: If the extension is neither .pdf nor .docx.
    """
    _, suffix = os.path.splitext(filepath)
    suffix = suffix.lower()

    if suffix == ".pdf":
        return extract_text_from_pdf(filepath)
    if suffix == ".docx":
        return extract_text_from_docx(filepath)

    raise ValueError(f"Unsupported file format: {suffix}. Use PDF or DOCX.")
def extract_email(raw_text):
    """
    Find the first email address in the resume text.

    Args:
        raw_text (str): The raw extracted text.

    Returns:
        str: The first email address found, or an empty string if none.
    """
    found = re.search(r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}', raw_text)
    if found is None:
        return ""
    return found.group(0)
def extract_phone(raw_text):
    """
    Extract a phone number from resume text.

    Patterns are tried in order: Indian (+91), US (+1), then a generic
    international shape. The first pattern that matches anything wins, and
    the longest of its matches is returned.

    Args:
        raw_text (str): The raw extracted text.

    Returns:
        str: The selected phone number, or an empty string if none found.
    """
    patterns = (
        r'(?:\+91[\s-]?)?[6-9]\d{4}[\s-]?\d{5}',  # Indian: +91 98765 43210
        r'(?:\+1[\s-]?)?\(?\d{3}\)?[\s.-]?\d{3}[\s.-]?\d{4}',  # US: (555) 123-4567
        r'\+?\d{1,3}[\s.-]?\d{3,4}[\s.-]?\d{3,4}[\s.-]?\d{0,4}',  # International
    )
    for pattern in patterns:
        matches = re.findall(pattern, raw_text)
        if not matches:
            continue
        # The international pattern ends with an optional separator followed
        # by zero-or-more digits, so a match can end on a dangling space,
        # "." or "-" (e.g. a number at the end of a sentence). Trim that
        # before comparing lengths so punctuation is never part of the result.
        candidates = [match.strip().rstrip(".-") for match in matches]
        # Longest candidate is the most likely to be a complete number.
        return max(candidates, key=len)
    return ""
def extract_candidate_name(raw_text):
    """
    Guess the candidate's name from the top of the resume.

    Looks at the first five lines of the stripped text and returns the
    first one that is non-empty, contains no email address or phone-like
    digit run, is not a common header word, is shorter than 60 characters
    and consists only of ASCII letters, whitespace, dots and hyphens.

    Args:
        raw_text (str): The raw extracted text.

    Returns:
        str: The title-cased name, or an empty string if no line qualifies.
    """
    header_words = ("resume", "curriculum vitae", "cv", "objective", "summary", "profile")
    email_pattern = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'
    phone_pattern = r'[\+]?\d[\d\s\-\(\)]{7,}'
    name_pattern = r'^[A-Za-z\s\.\-]+$'

    top_lines = raw_text.strip().split("\n")[:5]
    for raw_line in top_lines:
        candidate = raw_line.strip()
        if not candidate:
            continue
        # Contact details are never the name line.
        if re.search(email_pattern, candidate) or re.search(phone_pattern, candidate):
            continue
        # Section headers such as "Resume" or "Summary" are not names either.
        if candidate.lower() in header_words:
            continue
        # A short, letters-only line is taken to be the name.
        if len(candidate) < 60 and re.match(name_pattern, candidate):
            return candidate.title()
    return ""
def clean_text(raw_text):
    """
    Normalize extracted resume text for downstream NLP processing.

    The text is lowercased, then these substitutions run in order:
        1. URLs (http... / www....) are removed.
        2. Email addresses are removed.
        3. Every character other than a-z, 0-9, whitespace and the
           symbols + # - . / is replaced by a space.
        4. Runs of whitespace are collapsed to a single space.
    Finally, leading and trailing whitespace is stripped.

    Args:
        raw_text (str): The raw extracted text.

    Returns:
        str: Cleaned text ready for NLP processing.
    """
    # Order matters: URLs and emails must go before their punctuation is
    # blanked out, and whitespace is collapsed last.
    substitutions = (
        (r"http\S+|www\.\S+", ""),
        (r"\S+@\S+\.\S+", ""),
        (r"[^a-z0-9\s\+\#\-\.\/]", " "),
        (r"\s+", " "),
    )
    cleaned = raw_text.lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()