File size: 7,905 Bytes
3e62707
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
import fitz
from PIL import Image
import pytesseract
import re
import io
import json

def extract_text_from_pdf(file_path):
    """Extract the text of every page of a PDF, using OCR where needed.

    Each page is first read through PyMuPDF's text layer. A page whose text
    layer is empty (after stripping whitespace) is rendered at 300 DPI and
    passed to Tesseract instead.

    Args:
        file_path: Path of the PDF to open with ``fitz.open``.

    Returns:
        A ``(text, ocr_used)`` tuple. ``text`` is the concatenation of the
        per-page text, each page followed by a newline. ``ocr_used`` is True
        if at least one page had to go through OCR.
    """
    page_chunks = []
    ocr_used = False
    # The context manager closes the document even if a page fails to
    # render or OCR raises; the original left the handle open.
    with fitz.open(file_path) as doc:
        for page in doc:
            page_text = page.get_text().strip()
            if not page_text:
                ocr_used = True
                pix = page.get_pixmap(dpi=300)
                # Round-trip through an in-memory image so PIL can hand it
                # to pytesseract; close the PIL image once OCR is done.
                with Image.open(io.BytesIO(pix.tobytes())) as img:
                    page_text = pytesseract.image_to_string(img)
            page_chunks.append(page_text + "\n")
    return "".join(page_chunks), ocr_used

def split_sections(text, max_header_words=None):
    """Split resume text into known sections keyed by section name.

    A line is treated as a section heading when it contains one of the known
    heading phrases (case-insensitive substring match). When several phrases
    match the same line, the longest phrase wins, so "Project Experience" is
    filed under ``projects`` rather than being captured by the shorter
    "experience" phrase. Heading lines themselves are not stored. Lines that
    appear before the first heading are dropped.

    Args:
        text: Full resume text.
        max_header_words: Optional upper bound on the number of
            whitespace-separated words a line may have to be considered a
            heading. ``None`` (the default) applies no limit. Setting it
            keeps long content sentences that merely mention a keyword
            (e.g. "... improved my skills ...") from switching sections.

    Returns:
        A dict with the keys ``experience``, ``education``, ``skills``,
        ``certifications`` and ``projects``. Each value is the section's
        non-blank lines, stripped, each terminated by a newline (an empty
        string when the section was not found).
    """
    section_headers = {
        'experience': ['experience', 'work experience', 'professional experience'],
        'education': ['education', 'academic qualifications', 'qualifications'],
        'skills': ['skills', 'technical skills', 'key skills', 'core competencies'],
        'certifications': ['certifications', 'certification', 'achievements'],
        'projects': ['projects', 'project experience', 'personal projects', 'project']
    }

    # Longest phrases first so a specific heading beats a generic one that
    # is a substring of it. The sort is stable, so equally long phrases keep
    # their declaration order.
    ordered_headers = sorted(
        (
            (header, section)
            for section, headers in section_headers.items()
            for header in headers
        ),
        key=lambda pair: len(pair[0]),
        reverse=True,
    )

    collected = {section: [] for section in section_headers}
    current_section = None

    for raw_line in text.splitlines():
        line = raw_line.strip()
        if not line:
            continue
        lower_line = line.lower()

        matched_section = None
        short_enough = (
            max_header_words is None
            or len(lower_line.split()) <= max_header_words
        )
        if short_enough:
            matched_section = next(
                (section for header, section in ordered_headers
                 if header in lower_line),
                None,
            )

        if matched_section is not None:
            current_section = matched_section
            continue
        if current_section is not None:
            collected[current_section].append(line)

    return {
        section: "".join(entry + "\n" for entry in entries)
        for section, entries in collected.items()
    }

def parse_skills(section_text, ocr_used=False):
    """Turn the raw skills section into a list of individual skills.

    Skills may be separated by commas, pipes, line breaks, or any mix of
    them. Every line is split on commas and pipes, so a skill never carries
    an embedded newline when a delimited list wraps across lines.

    The previous ``skills.json`` keyword fallback was removed: it could
    never run, because any non-blank section already returned from the
    line-based branch before reaching it.

    Args:
        section_text: Text of the skills section.
        ocr_used: Whether the text came from OCR; lowers the confidence,
            matching the other section parsers.

    Returns:
        A ``(skills, confidence)`` tuple. ``skills`` is a list of stripped,
        non-empty strings, or ``None`` when nothing usable was found (in
        which case the confidence is 0.0). Otherwise the confidence is 0.9
        when ``ocr_used`` is true and 1.0 when it is not.
    """
    if not section_text.strip():
        return None, 0.0

    skills = [
        item.strip()
        for line in section_text.splitlines()
        for item in re.split(r"[,|]", line)
        if item.strip()
    ]
    if not skills:
        # Section held only delimiters / whitespace.
        return None, 0.0

    return skills, 0.9 if ocr_used else 1.0

def parse_experience(section_text, ocr_used=False):
    """Clean up the experience section and score its confidence.

    Blank lines are removed, as is any line containing the standalone word
    "project" or "skill" (case-insensitive). The surviving lines are joined
    with newlines and the result is stripped.

    Args:
        section_text: Text of the experience section.
        ocr_used: Whether the text came from OCR.

    Returns:
        ``(value, confidence)``. ``(None, 0.0)`` when the section is blank
        or every line was filtered out; otherwise the joined text with a
        confidence of 0.9 under OCR and 1.0 without it.
    """
    if not section_text.strip():
        return None, 0.0

    excluded_words = re.compile(r"\b(project|skill)\b", re.IGNORECASE)
    kept_rows = [
        row
        for row in section_text.splitlines()
        if row.strip() and excluded_words.search(row) is None
    ]
    if not kept_rows:
        return None, 0.0

    if ocr_used:
        confidence = 0.9
    else:
        confidence = 1.0
    return "\n".join(kept_rows).strip(), confidence

def parse_education(section_text, ocr_used=False):
    """Normalise the education section and score its confidence.

    Args:
        section_text: Text of the education section.
        ocr_used: Whether the text came from OCR.

    Returns:
        ``(value, confidence)``. ``(None, 0.0)`` for a blank section;
        otherwise the non-blank lines joined by newlines (outer whitespace
        stripped) with a confidence of 0.9 under OCR and 1.0 without it.
    """
    if section_text.strip() == "":
        return None, 0.0

    non_blank_rows = filter(lambda row: row.strip(), section_text.splitlines())
    cleaned = "\n".join(non_blank_rows).strip()
    return cleaned, (0.9 if ocr_used else 1.0)

def parse_certifications(section_text, ocr_used=False):
    """Normalise the certifications section and score its confidence.

    Args:
        section_text: Text of the certifications section.
        ocr_used: Whether the text came from OCR.

    Returns:
        ``(value, confidence)``. ``(None, 0.0)`` for a blank section;
        otherwise the non-blank lines joined by newlines (outer whitespace
        stripped) with a confidence of 0.9 under OCR and 1.0 without it.
    """
    if not section_text.strip():
        return None, 0.0

    kept_rows = []
    for row in section_text.splitlines():
        if row.strip():
            kept_rows.append(row)

    if ocr_used:
        score = 0.9
    else:
        score = 1.0
    return "\n".join(kept_rows).strip(), score

def parse_projects(section_text, ocr_used=False):
    """Group the projects section into one entry per project.

    A line is taken as a project title when it contains a four-digit run
    (typically a year), the text "present", or the text "github"
    (case-insensitive). Lines that follow a title are appended to that
    project's description. Lines that appear before any title are kept as an
    untitled entry instead of being discarded, so a section with no
    title-like line still yields its text rather than ``None``.

    Args:
        section_text: Text of the projects section.
        ocr_used: Whether the text came from OCR.

    Returns:
        ``(value, confidence)``. ``value`` is a newline-joined string with
        one entry per project, formatted ``"title: description"``, or just
        the title / just the description when the other part is missing.
        ``(None, 0.0)`` is returned when the section holds no usable text;
        otherwise the confidence is 0.9 under OCR and 1.0 without it.
    """
    if not section_text.strip():
        return None, 0.0

    title_marker = re.compile(r"\d{4}|present|github", re.IGNORECASE)

    # Each project is (title, [description lines]); the title is "" for text
    # that precedes the first title-like line.
    projects = []
    for line in section_text.splitlines():
        if not line.strip():
            continue
        if title_marker.search(line):
            projects.append((line.strip(), []))
        elif projects:
            projects[-1][1].append(line)
        else:
            projects.append(("", [line]))

    entries = []
    for title, description_lines in projects:
        description = " ".join(description_lines).strip()
        if title and description:
            entries.append(f"{title}: {description}")
        else:
            entries.append(title or description)

    if not entries:
        return None, 0.0
    return "\n".join(entries), 0.9 if ocr_used else 1.0

def parse_header_fields(text):
    """Extract name, email, phone and LinkedIn URL from resume text.

    The name is searched for only in the lines that precede the first line
    containing a section keyword, and never beyond the first eight lines.
    The first non-blank line that is either Title-Case words only, or an
    ALL-CAPS line of at least two words (converted to title case), is taken
    as the name. Email, phone and LinkedIn are searched across the whole
    text and the first match of each is returned.

    Args:
        text: Full resume text.

    Returns:
        A dict with the keys ``name``, ``email``, ``phone`` and
        ``linkedin``. Each value is ``{"value": ..., "confidence": ...}``
        where the confidence is 0.99 when a value was found and 0.0 (with a
        ``None`` value) when it was not.
    """
    lines = [line.strip() for line in text.splitlines()]
    section_keywords = ("objective", "summary", "experience", "education",
                        "project", "skill", "certification", "interests")

    # Index of the first line that looks like a section heading; the header
    # block (where the name lives) ends there.
    header_idx = next(
        (
            i for i, line in enumerate(lines)
            if any(kw in line.lower() for kw in section_keywords)
        ),
        len(lines),
    )

    name = ""
    for line in lines[:min(header_idx, 8)]:
        if not line:
            continue

        # Title-Case name such as "Jane Doe" (later words may be initials).
        if re.match(r'^[A-Z][a-z]+(?:\s+[A-Z][a-z]*)*$', line):
            name = line
            break

        # ALL-CAPS name such as "JANE DOE"; require two words so a lone
        # capitalised heading is not mistaken for a name.
        if re.match(r'^[A-Z\s]{3,}$', line) and len(line.split()) >= 2:
            name = line.title()
            break

    # The TLD class is [A-Za-z]; the former [A-Z|a-z] also accepted a
    # literal "|" and could swallow a pipe separator after the address.
    email_match = re.search(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b', text)
    phone_match = re.search(r'(?:\+?(\d{1,3}))?[-. (]*(\d{3})[-. )]*(\d{3})[-. ]*(\d{4})', text)
    linkedin_match = re.search(r'(https?://)?(www\.)?linkedin\.com/in/[\w-]+', text)

    def matched_field(match):
        """Build the value/confidence pair for an optional regex match."""
        if match is None:
            return {"value": None, "confidence": 0.0}
        return {"value": match.group(0), "confidence": 0.99}

    return {
        "name": {"value": name or None, "confidence": 0.99 if name else 0.0},
        "email": matched_field(email_match),
        "phone": matched_field(phone_match),
        "linkedin": matched_field(linkedin_match),
    }

def parse_resume(file_path):
    """Parse a PDF resume into a dict of fields with confidence scores.

    Extracts the text (with OCR where needed), splits it into sections,
    pulls the header fields out of the full text, and runs each section
    through its parser.

    Args:
        file_path: Path of the PDF resume.

    Returns:
        A dict containing the header fields returned by
        ``parse_header_fields`` followed by ``skills``, ``experience``,
        ``education``, ``projects`` and ``certifications``; each of those
        five is a ``{"value": ..., "confidence": ...}`` dict.
    """
    text, ocr_used = extract_text_from_pdf(file_path)
    sections = split_sections(text)
    header_data = parse_header_fields(text)

    # Parsers listed in the order they are invoked.
    section_parsers = (
        ('experience', parse_experience),
        ('education', parse_education),
        ('skills', parse_skills),
        ('projects', parse_projects),
        ('certifications', parse_certifications),
    )
    parsed = {}
    for section_name, parser in section_parsers:
        value, confidence = parser(sections.get(section_name, ''), ocr_used)
        parsed[section_name] = {"value": value, "confidence": confidence}

    # Header fields first, then the sections in the output order.
    result = dict(header_data)
    for key in ("skills", "experience", "education", "projects", "certifications"):
        result[key] = parsed[key]
    return result