| """ |
| resume_parser.py |
| ---------------- |
| Handles extracting text from PDF and DOCX resume files, |
| extracting candidate name, phone, email, and |
| cleaning the raw text for further processing. |
| """ |
|
|
| import re |
| import os |
| from PyPDF2 import PdfReader |
| from docx import Document |
|
|
|
|
def extract_text_from_pdf(filepath):
    """
    Read a PDF and return the text of every page that yields any.

    Pages are visited in document order and each non-empty page's text is
    followed by a newline. If opening or reading the file fails, the error
    is printed and whatever text was gathered up to that point is returned.

    Args:
        filepath (str): Path to the PDF file.

    Returns:
        str: Concatenated page text; empty if nothing could be read.
    """
    chunks = []
    try:
        for page in PdfReader(filepath).pages:
            content = page.extract_text()
            # Pages with no extractable text contribute nothing.
            if not content:
                continue
            chunks.append(content + "\n")
    except Exception as e:
        # Best-effort: report the failure and keep what was collected so far.
        print(f"[ERROR] Failed to read PDF: {e}")
    return "".join(chunks)
|
|
|
|
def extract_text_from_docx(filepath):
    """
    Read a DOCX file and return its paragraph text followed by its table text.

    Every paragraph is emitted on its own line (empty paragraphs included).
    Each table row is then emitted as one line, with its non-blank cell
    texts stripped and joined by " | "; rows with no non-blank cells are
    omitted. If the file cannot be opened or read, the error is printed and
    whatever text was gathered up to that point is returned.

    Args:
        filepath (str): Path to the DOCX file.

    Returns:
        str: Text of all paragraphs and tables; empty if nothing could be read.
    """
    chunks = []
    try:
        document = Document(filepath)

        # Body paragraphs first, one line each.
        for paragraph in document.paragraphs:
            chunks.append(paragraph.text + "\n")

        # Then tables, one line per row that has any content.
        for table in document.tables:
            for row in table.rows:
                cell_texts = [cell.text.strip() for cell in row.cells]
                joined = " | ".join(value for value in cell_texts if value)
                if joined:
                    chunks.append(joined + "\n")
    except Exception as e:
        # Best-effort: report the failure and keep what was collected so far.
        print(f"[ERROR] Failed to read DOCX: {e}")
    return "".join(chunks)
|
|
|
|
def extract_text(filepath):
    """
    Extract raw text from a resume, choosing the reader by file extension.

    The extension is compared case-insensitively: ".pdf" is handled by
    extract_text_from_pdf and ".docx" by extract_text_from_docx.

    Args:
        filepath (str): Path to a PDF or DOCX file.

    Returns:
        str: Extracted raw text.

    Raises:
        ValueError: If the extension is neither .pdf nor .docx.
    """
    _, raw_ext = os.path.splitext(filepath)
    ext = raw_ext.lower()

    if ext == ".pdf":
        return extract_text_from_pdf(filepath)
    if ext == ".docx":
        return extract_text_from_docx(filepath)

    raise ValueError(f"Unsupported file format: {ext}. Use PDF or DOCX.")
|
|
|
|
def extract_email(raw_text):
    """
    Return the first email address that appears in the resume text.

    Args:
        raw_text (str): The raw extracted text.

    Returns:
        str: The earliest email address found, or an empty string if none.
    """
    match = re.search(
        r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}',
        raw_text,
    )
    if match is None:
        return ""
    return match.group(0)
|
|
|
|
def extract_phone(raw_text):
    """
    Extract a phone number from resume text.

    Three patterns are tried in order: Indian mobile (optional +91),
    US (optional +1), then a generic international shape. The first
    pattern that matches anything wins, and the longest of that pattern's
    matches is returned with surrounding whitespace stripped.

    Each pattern is bounded by digit lookarounds so that it can neither
    start nor end in the middle of a longer run of digits. Without them, a
    fragment of an unrelated number (for example a 16-digit account number)
    would be reported as a phone number.

    Args:
        raw_text (str): The raw extracted text.

    Returns:
        str: The selected phone number, or an empty string if none matched.
    """
    patterns = (
        # Indian mobile: 10 digits starting 6-9, optional +91 prefix.
        r'(?<!\d)(?:\+91[\s-]?)?[6-9]\d{4}[\s-]?\d{5}(?!\d)',
        # US: 3-3-4 digits, optional +1 prefix and parenthesised area code.
        r'(?<!\d)(?:\+1[\s-]?)?\(?\d{3}\)?[\s.-]?\d{3}[\s.-]?\d{4}(?!\d)',
        # Generic international fallback.
        r'(?<!\d)\+?\d{1,3}[\s.-]?\d{3,4}[\s.-]?\d{3,4}[\s.-]?\d{0,4}(?!\d)',
    )
    for pattern in patterns:
        matches = re.findall(pattern, raw_text)
        if matches:
            # The fallback pattern may end on a separator, hence the strip().
            return max(matches, key=len).strip()
    return ""
|
|
|
|
def extract_candidate_name(raw_text):
    """
    Attempt to extract the candidate's name from the top of the resume.

    Only the first five lines of the stripped text are inspected (blank
    lines count toward the five). The first line that passes all of the
    following checks is returned in title case:

      * not empty
      * contains no email address
      * contains nothing that looks like a phone number
      * is not a bare heading such as "Resume" or "Summary"
      * is shorter than 60 characters, consists only of letters, spaces,
        dots and hyphens, and contains at least one letter

    The "at least one letter" check stops separator lines such as "-----"
    or "....." from being returned as a name.

    Args:
        raw_text (str): The raw extracted text.

    Returns:
        str: Candidate name or empty string.
    """
    # Built once rather than on every loop iteration.
    skip_words = {"resume", "curriculum vitae", "cv", "objective", "summary", "profile"}

    for line in raw_text.strip().split("\n")[:5]:
        line = line.strip()
        if not line:
            continue

        # Contact lines are never the name.
        if re.search(r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}', line):
            continue
        if re.search(r'[\+]?\d[\d\s\-\(\)]{7,}', line):
            continue

        # Section or document headings that stand alone on a line.
        if line.lower() in skip_words:
            continue

        if (
            len(line) < 60
            and re.match(r'^[A-Za-z\s\.\-]+$', line)
            and re.search(r'[A-Za-z]', line)
        ):
            return line.title()
    return ""
|
|
|
|
def clean_text(raw_text):
    """
    Clean and normalize extracted text.

    Steps:
        1. Convert to lowercase
        2. Remove URLs (http://..., https://..., www....)
        3. Remove email addresses
        4. Remove special characters (keep letters, numbers, spaces, and +, #, -, ., /)
        5. Collapse multiple spaces into one
        6. Strip leading/trailing whitespace

    Only tokens with an explicit "http://" or "https://" scheme, or a
    "www." prefix, are treated as URLs. Plain words that merely begin with
    "http" (for example the skill keywords "https", "httpd" or "http/2")
    are kept.

    Args:
        raw_text (str): The raw extracted text.

    Returns:
        str: Cleaned text ready for NLP processing.
    """
    text = raw_text.lower()

    # Remove URLs. The scheme separator is required so ordinary words that
    # start with "http" survive.
    text = re.sub(r"https?://\S+|www\.\S+", "", text)

    # Remove email addresses.
    text = re.sub(r"\S+@\S+\.\S+", "", text)

    # Replace everything except letters, digits, whitespace and + # - . /
    # with a space, so adjacent words do not get glued together.
    text = re.sub(r"[^a-z0-9\s\+\#\-\.\/]", " ", text)

    # Collapse all runs of whitespace (including newlines) into one space.
    text = re.sub(r"\s+", " ", text)

    return text.strip()