| """ |
| PDF Parser Module for Resume Analysis |
| Provides advanced text extraction with OCR support for PDF documents. |
| """ |
|
|
| import os |
| import sys |
| from pathlib import Path |
| from typing import Optional, List, Dict, Any |
| import logging |
|
|
# PyPDF2 is treated as optional at import time: the flag records whether the
# import worked, and a hint is printed when it did not.
try:
    from PyPDF2 import PdfReader
except ImportError:
    PYPDF2_AVAILABLE = False
    print("PyPDF2 not found. Install with: pip install PyPDF2")
else:
    PYPDF2_AVAILABLE = True


# The OCR path needs both pytesseract and pdf2image; if either import fails
# the flag is False and OCR is reported as unavailable.
try:
    import pytesseract
    from pdf2image import convert_from_path
except ImportError:
    OCR_AVAILABLE = False
    print("OCR libraries not found. Install with: pip install pytesseract pdf2image")
else:
    OCR_AVAILABLE = True


# Root logging configuration plus the module-level logger used throughout
# this file.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
)
logger = logging.getLogger(__name__)
|
|
class PDFExtractor:
    """Extract text and metadata from one PDF file, with an OCR fallback.

    Typical use is ``PDFExtractor(path).extract_all_text()``, which validates
    the path, opens it with PyPDF2 and returns a result dict. Expected
    failures (missing file, unreadable PDF, failed OCR) are logged and
    reported through return values rather than raised.
    """

    # (result key, PDF info-dictionary key) pairs copied into "metadata".
    _METADATA_KEYS = (
        ("title", "/Title"),
        ("author", "/Author"),
        ("subject", "/Subject"),
        ("creator", "/Creator"),
        ("producer", "/Producer"),
        ("creation_date", "/CreationDate"),
        ("modification_date", "/ModDate"),
    )

    def __init__(self, pdf_path: str, *, ocr_dpi: int = 300, ocr_lang: str = "eng"):
        """Remember the target path and OCR settings; no I/O happens here.

        Args:
            pdf_path: Location of the PDF to read.
            ocr_dpi: Resolution passed to pdf2image when a page is rasterised
                for OCR.
            ocr_lang: Language code passed to pytesseract.
        """
        self.pdf_path = Path(pdf_path)
        self.reader = None  # set by load_pdf()
        self.is_encrypted = False  # set by load_pdf()
        self.text_content = {}  # kept for backward compatibility; unused here
        self.ocr_dpi = ocr_dpi
        self.ocr_lang = ocr_lang

    def validate_file(self) -> bool:
        """Return True when the path exists, is a regular file and is not empty."""
        if not self.pdf_path.exists():
            logger.error(f"PDF file not found: {self.pdf_path}")
            return False

        if not self.pdf_path.is_file():
            logger.error(f"Path is not a file: {self.pdf_path}")
            return False

        if self.pdf_path.stat().st_size == 0:
            logger.error(f"PDF file is empty: {self.pdf_path}")
            return False

        return True

    def load_pdf(self) -> bool:
        """Open the PDF with PyPDF2 and store the reader on ``self.reader``.

        Returns:
            True when the document was opened (and, if encrypted, could be
            decrypted with an empty password); False otherwise.
        """
        if not PYPDF2_AVAILABLE:
            # Without this check the failure would surface as an obscure
            # "name 'PdfReader' is not defined" message.
            logger.error("Failed to load PDF: PyPDF2 is not installed")
            return False

        try:
            self.reader = PdfReader(self.pdf_path)

            if self.reader.is_encrypted:
                self.is_encrypted = True
                logger.warning("PDF is encrypted. Text extraction may be limited.")
                # Try the empty password before giving up. Per the PyPDF2
                # docs decrypt() returns a falsy value when the password does
                # not match; readers that already tried it are unaffected.
                if not self.reader.decrypt(""):
                    logger.error("Failed to load PDF: a password is required to decrypt it")
                    return False

            logger.info(f"PDF loaded successfully. Pages: {len(self.reader.pages)}")
            return True

        except Exception as e:
            logger.error(f"Failed to load PDF: {e}")
            return False

    def extract_text_from_page(self, page, page_num: int) -> str:
        """Extract text from a single page using multiple methods.

        Args:
            page: Page object exposing ``extract_text()``.
            page_num: Zero-based page index (logged and OCR'd as page_num + 1).

        Returns:
            The page text, or "" when every method came up empty.
        """
        return self._extract_page_with_method(page, page_num)[0]

    def _extract_page_with_method(self, page, page_num: int):
        """Return ``(text, method)``; method is "direct", "ocr", "fallback" or "none"."""
        label = page_num + 1

        # 1) Embedded text layer.
        text = ""
        try:
            text = page.extract_text() or ""
        except Exception as e:
            logger.warning(f"Page {label}: Text extraction failed - {e}")

        if text.strip():
            logger.info(f"Page {label}: Text extracted successfully")
            return text, "direct"

        # 2) OCR. Whitespace-only direct output counts as "no text", so pages
        # that yield only blanks still reach this step.
        if OCR_AVAILABLE:
            try:
                ocr_text = self._extract_text_with_ocr(page_num)
            except Exception as e:
                logger.warning(f"Page {label}: OCR failed - {e}")
            else:
                if ocr_text and ocr_text.strip():
                    logger.info(f"Page {label}: Text extracted via OCR")
                    return ocr_text, "ocr"

        # 3) Best-effort fallback: annotation text and form fields, when the
        # page object happens to expose them.
        pieces = []
        try:
            # ``annotations`` may be absent or None; neither should prevent
            # the form-field lookup below.
            for annotation in getattr(page, "annotations", None) or ():
                if hasattr(annotation, "get_text"):
                    pieces.append(annotation.get_text() + "\n")

            if hasattr(page, "get_form_text_fields"):
                for field_name, field_value in page.get_form_text_fields().items():
                    if field_value:
                        pieces.append(f"{field_name}: {field_value}\n")
        except Exception as e:
            logger.debug(f"Alternative extraction methods failed: {e}")

        fallback_text = "".join(pieces).strip()
        return fallback_text, ("fallback" if fallback_text else "none")

    def _extract_text_with_ocr(self, page_num: int) -> str:
        """Rasterise one page with pdf2image and OCR it with pytesseract.

        Returns "" when no image was produced or when OCR raised (the error
        is logged).
        """
        try:
            images = convert_from_path(
                self.pdf_path,
                first_page=page_num + 1,
                last_page=page_num + 1,
                dpi=self.ocr_dpi,
            )
            if images:
                return pytesseract.image_to_string(images[0], lang=self.ocr_lang)
        except Exception as e:
            logger.error(f"OCR processing failed: {e}")

        return ""

    def extract_all_text(self) -> Dict[str, Any]:
        """Extract text from all pages with comprehensive metadata.

        Returns:
            ``{"error": message}`` when validation or loading fails. Otherwise
            a dict with ``file_path``, ``total_pages``, ``is_encrypted``,
            ``pages`` (1-based page number -> ``text`` / ``has_text`` /
            ``extraction_method``), ``full_text``, ``metadata`` and
            ``extraction_date`` (ISO-8601 local timestamp).
        """
        if not self.validate_file():
            return {"error": "Invalid PDF file"}

        if not self.load_pdf():
            return {"error": "Failed to load PDF"}

        from datetime import datetime

        pages = self.reader.pages
        result = {
            "file_path": str(self.pdf_path),
            "total_pages": len(pages),
            "is_encrypted": self.is_encrypted,
            "pages": {},
            "full_text": "",
            "metadata": {},
            # save_extracted_text() reads this key for its "Extracted on" line.
            "extraction_date": datetime.now().astimezone().isoformat(timespec="seconds"),
        }

        try:
            info = self.reader.metadata
            if info:
                result["metadata"] = {
                    name: info.get(pdf_key, "") for name, pdf_key in self._METADATA_KEYS
                }
        except Exception as e:
            logger.warning(f"Failed to extract metadata: {e}")

        chunks = []
        for page_num, page in enumerate(pages):
            page_text, method = self._extract_page_with_method(page, page_num)

            result["pages"][page_num + 1] = {
                "text": page_text,
                "has_text": bool(page_text.strip()),
                "extraction_method": method,
            }
            chunks.append(f"\n--- Page {page_num + 1} ---\n{page_text}\n")

        result["full_text"] = "".join(chunks)
        return result

    def save_extracted_text(self, output_path: Optional[str] = None) -> str:
        """Run the extraction and write a text report.

        Args:
            output_path: Destination file; defaults to the PDF path with a
                ``.txt`` suffix.

        Returns:
            The path written, or "" when extraction or writing failed.
        """
        result = self.extract_all_text()

        if "error" in result:
            logger.error(f"Cannot save: {result['error']}")
            return ""

        if not output_path:
            output_path = self.pdf_path.with_suffix('.txt')

        try:
            header = [
                "PDF Text Extraction Results\n",
                f"File: {result['file_path']}\n",
                f"Pages: {result['total_pages']}\n",
                f"Encrypted: {result['is_encrypted']}\n",
                f"Extracted on: {result.get('extraction_date', 'Unknown')}\n",
                "=" * 50 + "\n\n",
            ]
            with open(output_path, 'w', encoding='utf-8') as f:
                f.writelines(header)
                f.write(result['full_text'])

            logger.info(f"Text saved to: {output_path}")
            return str(output_path)

        except Exception as e:
            logger.error(f"Failed to save text: {e}")
            return ""
|
|
def extract_resume_text(file_path: str) -> Dict[str, Any]:
    """
    Run a PDFExtractor over one resume and flatten its result.

    Args:
        file_path: Path to the resume file

    Returns:
        Dict that always carries ``success`` and ``file_path``.
        On success it also has ``text``, ``metadata``, ``pages``,
        ``total_pages`` and ``is_encrypted``; on failure it has ``error``
        (either the extractor's message or the text of a caught exception).
    """
    try:
        outcome = PDFExtractor(file_path).extract_all_text()

        if "error" not in outcome:
            payload = {
                "success": True,
                "file_path": file_path,
                "text": outcome["full_text"],
            }
            # The remaining fields are passed through under the same names.
            for key in ("metadata", "pages", "total_pages", "is_encrypted"):
                payload[key] = outcome[key]
            return payload

        return {
            "success": False,
            "file_path": file_path,
            "error": outcome["error"],
        }

    except Exception as e:
        logger.error(f"Failed to extract text from {file_path}: {e}")
        return {
            "success": False,
            "file_path": file_path,
            "error": str(e),
        }
|
|
def process_batch_resumes(file_paths: List[str]) -> List[Dict[str, Any]]:
    """
    Extract every resume in ``file_paths`` and log progress along the way.

    Args:
        file_paths: List of file paths to process

    Returns:
        One ``extract_resume_text`` result per input path, in input order
    """
    total_files = len(file_paths)
    logger.info(f"Starting batch processing of {total_files} files")

    results = []
    successful = 0

    for position, file_path in enumerate(file_paths, start=1):
        logger.info(f"Processing file {position}/{total_files}: {file_path}")
        outcome = extract_resume_text(file_path)
        results.append(outcome)

        if not outcome["success"]:
            logger.warning(f"✗ Failed to process: {file_path} - {outcome['error']}")
            continue

        logger.info(f"✓ Successfully processed: {file_path}")
        successful += 1

    failed = total_files - successful
    logger.info(f"Batch processing complete: {successful} successful, {failed} failed")

    return results
|
|
# Ordered (section, keywords) pairs. The first section with a keyword that
# occurs anywhere in a line wins, so this order is the matching precedence.
_SECTION_KEYWORDS = (
    ("skills", ("skill", "technology", "programming", "framework")),
    ("experience", ("experience", "work", "employment", "job")),
    ("education", ("education", "degree", "university", "college", "school")),
    ("summary", ("summary", "profile", "objective", "about")),
    ("contact_info", ("email", "phone", "@", "linkedin", "github")),
)


def extract_resume_sections(text: str) -> Dict[str, str]:
    """
    Split resume text into coarse sections using keyword heuristics.

    Each non-blank line is scanned, case-insensitively, for the section
    keywords; a match switches the current section before the line is stored,
    so the matching line itself lands in the new section. Lines seen before
    any keyword go to ``other``.

    Args:
        text: Raw resume text. ``None`` or "" yields all-empty sections.

    Returns:
        Dict with the keys ``contact_info``, ``skills``, ``experience``,
        ``education``, ``summary`` and ``other``; each value is that section's
        lines joined with "\n" and stripped of surrounding whitespace.
    """
    collected = {
        "contact_info": [],
        "skills": [],
        "experience": [],
        "education": [],
        "summary": [],
        "other": [],
    }
    current_section = "other"

    # splitlines() also handles "\r\n" input, so no stray "\r" ends up inside
    # the section text.
    for line in (text or "").splitlines():
        lowered = line.lower().strip()
        if not lowered:
            continue  # blank lines neither switch sections nor get stored

        for section, keywords in _SECTION_KEYWORDS:
            if any(keyword in lowered for keyword in keywords):
                current_section = section
                break

        collected[current_section].append(line)

    return {name: "\n".join(lines).strip() for name, lines in collected.items()}
|
|
def main():
    """Command-line entry point: extract one PDF named in argv and print a summary."""
    cli_args = sys.argv[1:]

    # No path given: show usage and stop.
    if not cli_args:
        print("Usage: python pdf_parser.py <file_path>")
        print("For batch processing, use the programmatic functions directly.")
        return

    pdf_path = cli_args[0]
    result = extract_resume_text(pdf_path)

    if not result["success"]:
        print(f"✗ Failed to extract text: {result['error']}")
        return

    print(f"✓ Successfully extracted text from: {pdf_path}")
    print(f"Text length: {len(result['text'])} characters")
    print(f"Pages: {result['total_pages']}")


if __name__ == "__main__":
    main()