Spaces:
No application file
No application file
File size: 7,905 Bytes
3e62707 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 |
import fitz
from PIL import Image
import pytesseract
import re
import io
import json
def extract_text_from_pdf(file_path):
    """Extract all text from a PDF, falling back to OCR for image-only pages.

    Pages with an embedded text layer are read directly; pages without one
    are rasterized at 300 DPI and run through Tesseract.

    Returns a tuple (text, ocr_used) where ocr_used is True if any page
    required OCR.
    """
    doc = fitz.open(file_path)
    parts = []
    ocr_used = False
    for page in doc:
        extracted = page.get_text().strip()
        if not extracted:
            # No text layer on this page: render it and OCR the image.
            ocr_used = True
            pix = page.get_pixmap(dpi=300)
            image = Image.open(io.BytesIO(pix.tobytes()))
            extracted = pytesseract.image_to_string(image)
        parts.append(extracted + "\n")
    return "".join(parts), ocr_used
def split_sections(text):
    """Split raw resume text into named sections.

    A line is treated as a section header when it is short and matches one of
    the known header phrases; all following lines are accumulated under that
    section until the next header.

    Fix: the original treated ANY line containing a keyword as a header (so a
    body line like "Led a project to ..." was dropped and flipped the current
    section). Header detection is now restricted to short lines, which is what
    real section headers look like.

    Returns a dict mapping section name -> accumulated text ("" if absent).
    """
    section_headers = {
        'experience': ['experience', 'work experience', 'professional experience'],
        'education': ['education', 'academic qualifications', 'qualifications'],
        'skills': ['skills', 'technical skills', 'key skills', 'core competencies'],
        'certifications': ['certifications', 'certification', 'achievements'],
        'projects': ['projects', 'project experience', 'personal projects', 'project']
    }
    sections = {key: "" for key in section_headers}

    # Real headers are short ("Work Experience", "Skills:"); long lines are
    # body content even if they happen to contain a keyword.
    _MAX_HEADER_LEN = 40

    def _match_header(lower_line):
        """Return the section name if lower_line looks like a header, else None."""
        if len(lower_line) > _MAX_HEADER_LEN:
            return None
        for sec, headers in section_headers.items():
            for header in headers:
                if (lower_line.startswith(header) or
                        lower_line.endswith(header) or
                        header in lower_line):
                    return sec
        return None

    current_section = None
    for raw in text.splitlines():
        line = raw.strip()
        if not line:
            continue
        matched = _match_header(line.lower())
        if matched:
            current_section = matched
            continue
        if current_section:
            sections[current_section] += line + "\n"
    return sections
def parse_skills(section_text, ocr_used=False):
    """Parse a skills section into a list of skill strings.

    Strategy: try comma/pipe-delimited format first, then one-skill-per-line,
    then fall back to matching against a known-skills dictionary (skills.json).

    Fix: `ocr_used` was accepted but ignored — every other parser in this file
    lowers confidence to 0.9 for OCR-derived text, so this one now does too.

    Returns (skills_or_None, confidence).
    """
    if not section_text.strip():
        return None, 0.0
    # Consistent with parse_experience/parse_education/etc.
    base_conf = 0.9 if ocr_used else 1.0
    # Comma/pipe separated format, e.g. "Python, SQL | Docker".
    if re.search(r"[,|]", section_text):
        cleaned = [s.strip() for s in re.split(r"\s*[,|]\s*", section_text) if s.strip()]
        if cleaned:
            return cleaned, base_conf
    # One skill per line.
    lines = [line.strip() for line in section_text.splitlines() if line.strip()]
    if lines:
        return lines, base_conf
    # Fallback: dictionary lookup against a known-skills list.
    try:
        with open("skills.json", "r") as f:
            skills_list = json.load(f)
    except FileNotFoundError:
        skills_list = []
    text_lower = section_text.lower()
    found_skills = [skill for skill in skills_list
                    if re.search(rf"\b{re.escape(skill.lower())}\b", text_lower)]
    # Dictionary matches are less certain than an explicit list: cap at 0.8.
    return found_skills or None, 0.8 if found_skills else 0.0
def parse_experience(section_text, ocr_used=False):
    """Parse an experience section into a single text blob.

    Drops lines mentioning "project" or "skill", which indicate content that
    bled in from neighboring sections during splitting.

    Returns (experience_text_or_None, confidence); confidence is 0.9 when the
    text came from OCR, 1.0 otherwise.
    """
    if not section_text.strip():
        return None, 0.0
    bleed = re.compile(r"\b(project|skill)\b", re.IGNORECASE)
    kept = [ln for ln in section_text.splitlines()
            if ln.strip() and not bleed.search(ln)]
    if not kept:
        return None, 0.0
    return "\n".join(kept).strip(), 0.9 if ocr_used else 1.0
def parse_education(section_text, ocr_used=False):
    """Parse an education section: join its non-blank lines into one blob.

    Returns (education_text_or_None, confidence); 0.9 confidence under OCR.
    """
    if not section_text.strip():
        return None, 0.0
    content = [ln for ln in section_text.splitlines() if ln.strip()]
    return "\n".join(content).strip(), 0.9 if ocr_used else 1.0
def parse_certifications(section_text, ocr_used=False):
    """Parse a certifications section: join its non-blank lines into one blob.

    Returns (certifications_text_or_None, confidence); 0.9 under OCR.
    """
    if not section_text.strip():
        return None, 0.0
    nonblank = filter(str.strip, section_text.splitlines())
    return "\n".join(nonblank).strip(), 0.9 if ocr_used else 1.0
def parse_projects(section_text, ocr_used=False):
    """Group a projects section into "title: description" entries.

    A line containing a 4-digit year, "present", or "github" starts a new
    project (it becomes the title); subsequent lines are folded into that
    project's description. Description text seen before the first title line
    is discarded, matching the section's original behavior.

    Returns (joined_entries_or_None, confidence); 0.9 confidence under OCR.
    """
    if not section_text.strip():
        return None, 0.0
    title_pat = re.compile(r'(.*\d{4}.*|.*present.*|.*github.*)', re.IGNORECASE)
    entries = []
    title, desc = "", ""
    for line in section_text.splitlines():
        if not line.strip():
            continue
        if title_pat.match(line):
            if title:
                entries.append((title, desc))
            title, desc = line.strip(), ""
        else:
            desc += line + " "
    if title:
        entries.append((title, desc))
    formatted = [f"{t}: {d.strip()}" if d.strip() else t for t, d in entries]
    joined = "\n".join(formatted) if formatted else None
    return joined, 0.9 if ocr_used else 1.0
def parse_header_fields(text):
    """Extract name, email, phone, and LinkedIn URL from resume header text.

    The name is only searched for above the first recognized section header
    (and within the first 8 lines), accepting either Title Case or ALL-CAPS
    multi-word lines. Email/phone/LinkedIn are searched in the full text.

    Returns {field: {"value": str_or_None, "confidence": float}}.
    """
    stripped = [ln.strip() for ln in text.splitlines()]
    keywords = ("objective", "summary", "experience", "education",
                "project", "skill", "certification", "interests")

    # Index of the first line that looks like a section header; the name must
    # appear before it.
    first_section = len(stripped)
    for idx, ln in enumerate(stripped):
        if any(kw in ln.lower() for kw in keywords):
            first_section = idx
            break

    name = ""
    for ln in stripped[:min(first_section, 8)]:
        if not ln:
            continue
        # Title Case name, e.g. "John Smith".
        if re.match(r'^[A-Z][a-z]+(?:\s+[A-Z][a-z]*)*$', ln):
            name = ln
            break
        # ALL-CAPS fallback, e.g. "JOHN SMITH" -> "John Smith".
        if re.match(r'^[A-Z\s]{3,}$', ln) and len(ln.split()) >= 2:
            name = ln.title()
            break

    email = re.search(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b', text)
    phone = re.search(r'(?:\+?(\d{1,3}))?[-. (]*(\d{3})[-. )]*(\d{3})[-. ]*(\d{4})', text)
    linkedin = re.search(r'(https?://)?(www\.)?linkedin\.com/in/[\w-]+', text)

    def _field(value):
        # Presence-based confidence: 0.99 when found, 0.0 otherwise.
        return {"value": value or None, "confidence": 0.99 if value else 0.0}

    return {
        "name": _field(name),
        "email": _field(email.group(0) if email else None),
        "phone": _field(phone.group(0) if phone else None),
        "linkedin": _field(linkedin.group(0) if linkedin else None),
    }
def parse_resume(file_path):
    """End-to-end resume parsing for a PDF file.

    Extracts text (using OCR where needed), splits it into sections, and
    parses each field. Returns a dict mapping field name to
    {"value": ..., "confidence": ...}, with header fields (name, email,
    phone, linkedin) first, followed by the section fields.
    """
    text, ocr_used = extract_text_from_pdf(file_path)
    sections = split_sections(text)

    result = dict(parse_header_fields(text))
    # Section name -> parser; insertion order fixes the output key order.
    parsers = {
        "skills": parse_skills,
        "experience": parse_experience,
        "education": parse_education,
        "projects": parse_projects,
        "certifications": parse_certifications,
    }
    for key, parser in parsers.items():
        value, confidence = parser(sections.get(key, ''), ocr_used)
        result[key] = {"value": value, "confidence": confidence}
    return result