File size: 3,995 Bytes
4689a82
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
import pdfplumber
import PyPDF2
import pytesseract
from pdf2image import convert_from_path
from PIL import Image
import re
from typing import Dict, List

class PDFExtractor:
    """Extract text from PDF resumes with fallback mechanisms.

    All methods are static; the class is used purely as a namespace.
    """

    # Section-header keyword patterns, compiled once at class creation.
    # They are matched case-insensitively against the *original* text so
    # that match offsets can be used directly to slice that text.
    # NOTE(review): the keywords are matched anywhere in the text (no word
    # or line anchoring), so a keyword used mid-sentence also starts a
    # section.
    _SECTION_PATTERNS = {
        'experience': re.compile(
            r'(experience|work history|employment|professional experience)',
            re.IGNORECASE,
        ),
        'education': re.compile(
            r'(education|academic|qualifications|degrees)', re.IGNORECASE
        ),
        'skills': re.compile(
            r'(skills|technical skills|competencies|expertise)', re.IGNORECASE
        ),
        'projects': re.compile(
            r'(projects|portfolio|work samples)', re.IGNORECASE
        ),
        'summary': re.compile(
            r'(summary|objective|profile|about)', re.IGNORECASE
        ),
    }

    @staticmethod
    def extract_text(pdf_path: str) -> str:
        """Extract normalized text from a PDF, trying three backends in order.

        The backends are pdfplumber, then PyPDF2, then Tesseract OCR on page
        images rendered at 300 dpi. The first backend that yields non-blank
        text wins, and its output is passed through ``normalize_text``.
        Progress and fallback messages are printed to stdout.

        Args:
            pdf_path: Path of the PDF file to read.

        Returns:
            The normalized text of the document.

        Raises:
            Exception: If the OCR fallback raises or yields only blank text.
                When OCR itself raised, that error is chained as the cause.
        """
        try:
            # Primary: pdfplumber
            with pdfplumber.open(pdf_path) as pdf:
                # Join pages with a newline so the last word of one page is
                # not fused with the first word of the next.
                text = "\n".join(
                    page.extract_text() or "" for page in pdf.pages
                )

                if text.strip():
                    return PDFExtractor.normalize_text(text)
        except Exception as e:
            print(f"pdfplumber failed: {e}, trying PyPDF2...")

        try:
            # Fallback 1: PyPDF2
            with open(pdf_path, 'rb') as file:
                reader = PyPDF2.PdfReader(file)
                text = "\n".join(
                    page.extract_text() or "" for page in reader.pages
                )

                if text.strip():
                    return PDFExtractor.normalize_text(text)
        except Exception as e:
            print(f"PyPDF2 failed: {e}, trying OCR...")

        try:
            # Fallback 2: OCR with Tesseract
            print("Attempting OCR extraction (this may take a moment)...")
            images = convert_from_path(pdf_path, dpi=300)
            page_texts = []
            for i, image in enumerate(images):
                page_text = pytesseract.image_to_string(image, lang='eng')
                page_texts.append(page_text)
                print(f"OCR page {i+1}: extracted {len(page_text)} chars")
            text = "\n".join(page_texts)
        except Exception as e:
            # Chain the original error so its traceback is not lost.
            raise Exception(
                f"All extraction methods failed. Last error (OCR): {e}"
            ) from e

        if not text.strip():
            raise Exception(
                "All extraction methods failed. Last error (OCR): "
                "OCR extraction returned empty text"
            )
        return PDFExtractor.normalize_text(text)

    @staticmethod
    def normalize_text(text: str) -> str:
        """Clean and normalize extracted text.

        Characters other than word characters, whitespace and the
        punctuation ``. , ; : ( ) - @`` are deleted, then every run of
        whitespace (including newlines) is collapsed to a single space and
        the result is stripped.

        Args:
            text: Raw text to clean.

        Returns:
            The cleaned, single-line text.
        """
        # Remove special characters but keep punctuation.
        # NOTE(review): this also deletes '+', '#' and '/', so e.g. "C++"
        # and "C#" both become "C" -- confirm that is acceptable downstream.
        text = re.sub(r'[^\w\s.,;:()\-@]', '', text)
        # Collapse whitespace *after* the removal above: deleting a character
        # that sat between two spaces would otherwise leave a double space.
        text = re.sub(r'\s+', ' ', text)
        return text.strip()

    @staticmethod
    def detect_sections(text: str) -> Dict[str, str]:
        """Detect common resume sections by header keyword.

        Every occurrence of a section keyword starts a chunk that runs up to
        the next keyword occurrence (of any section) or the end of the text.
        Chunks include the keyword itself and are stripped. When a section's
        keywords occur more than once, its chunks are joined with a single
        space in document order. Text preceding the first keyword is not
        assigned to any section.

        Args:
            text: Resume text to split.

        Returns:
            A dict with the keys ``experience``, ``education``, ``skills``,
            ``projects``, ``summary`` and ``other``. Sections that were not
            found map to ``''``. ``other`` receives the whole text only when
            no section keyword was found at all.
        """
        sections = dict.fromkeys(PDFExtractor._SECTION_PATTERNS, '')
        sections['other'] = ''

        # Find section boundaries. Matching is done on the original text
        # (not a lower-cased copy) because lower-casing can change the
        # string length for some characters, which would shift the offsets.
        section_positions = []
        for section_name, pattern in PDFExtractor._SECTION_PATTERNS.items():
            for match in pattern.finditer(text):
                section_positions.append((match.start(), section_name))

        # Sort by position
        section_positions.sort()

        # Each chunk ends where the next one starts; the last ends at EOF.
        end_positions = [pos for pos, _ in section_positions[1:]]
        end_positions.append(len(text))

        # Extract section content, appending rather than overwriting so a
        # repeated keyword does not discard the section's earlier content.
        for (start_pos, section_name), end_pos in zip(
            section_positions, end_positions
        ):
            chunk = text[start_pos:end_pos].strip()
            if sections[section_name]:
                sections[section_name] += ' ' + chunk
            else:
                sections[section_name] = chunk

        # If no sections detected, put everything in 'other'
        if not any(sections.values()):
            sections['other'] = text

        return sections