# Resume-parser / Src / pdf_parser.py
# Uploaded by Akash076 ("Upload pdf_parser.py", commit 3e62707, verified)
import fitz
from PIL import Image
import pytesseract
import re
import io
import json
def extract_text_from_pdf(file_path):
    """Extract the text of every page of a PDF, falling back to OCR.

    Each page's embedded text layer is read first. A page whose text layer
    is empty (after stripping whitespace) is rendered to an image and run
    through Tesseract instead.

    Args:
        file_path: Path passed straight to ``fitz.open``.

    Returns:
        A ``(text, ocr_used)`` tuple. ``text`` is the per-page text, each
        page followed by a newline. ``ocr_used`` is True if at least one
        page had to be OCR'd.
    """
    page_chunks = []
    ocr_used = False
    doc = fitz.open(file_path)
    try:
        for page in doc:
            page_text = page.get_text().strip()
            if not page_text:
                # No text layer on this page: render it and OCR the image.
                ocr_used = True
                pix = page.get_pixmap(dpi=300)
                img = Image.open(io.BytesIO(pix.tobytes()))
                page_text = pytesseract.image_to_string(img)
            page_chunks.append(page_text + "\n")
    finally:
        # Always release the document handle, even if a page fails to parse.
        doc.close()
    return "".join(page_chunks), ocr_used
def _match_section_header(line, aliases, max_header_words):
    """Return ``(section, inline_text)`` if *line* is a header, else ``(None, "")``."""
    # "Skills: Python, SQL" style: an alias followed by a colon. The text
    # after the colon is content of that section and must not be lost.
    for alias, section in aliases:
        labelled = re.match(
            rf"\W*{re.escape(alias)}\s*:\s*(.*)$", line, re.IGNORECASE
        )
        if labelled:
            return section, labelled.group(1).strip()
    # Bare heading: only short lines qualify, and the alias must appear as
    # whole words, so ordinary sentences that merely mention "skills" or
    # "project" are not mistaken for headers.
    if len(line.split()) <= max_header_words:
        lowered = line.lower()
        for alias, section in aliases:
            if re.search(rf"\b{re.escape(alias)}\b", lowered):
                return section, ""
    return None, ""


def split_sections(text):
    """Split resume text into its known sections.

    A line is treated as a section header when it either starts with a
    known header alias followed by a colon (the remainder of the line is
    kept as section content), or is a short line (at most four
    whitespace-separated tokens) containing an alias as whole words.
    Longer aliases are tried first, so "Project Experience" resolves to
    ``projects`` rather than ``experience``. This is a heuristic: a short
    body line such as a job title containing an alias is still taken as a
    header.

    Header lines themselves are not stored. Non-empty lines that appear
    before the first header are dropped.

    Args:
        text: Full resume text.

    Returns:
        Dict with the keys ``experience``, ``education``, ``skills``,
        ``certifications`` and ``projects``. Each value is the section's
        stripped lines, each terminated by a newline ("" if the section
        was not found).
    """
    section_headers = {
        'experience': ['experience', 'work experience', 'professional experience'],
        'education': ['education', 'academic qualifications', 'qualifications'],
        'skills': ['skills', 'technical skills', 'key skills', 'core competencies'],
        'certifications': ['certifications', 'certification', 'achievements'],
        'projects': ['projects', 'project experience', 'personal projects', 'project'],
    }
    max_header_words = 4

    # Longest alias first; sorted() is stable, so ties keep dict order.
    aliases = sorted(
        (
            (alias, section)
            for section, names in section_headers.items()
            for alias in names
        ),
        key=lambda pair: len(pair[0]),
        reverse=True,
    )

    collected = {section: [] for section in section_headers}
    current_section = None
    for raw_line in text.splitlines():
        line = raw_line.strip()
        if not line:
            continue
        section, inline_text = _match_section_header(
            line, aliases, max_header_words
        )
        if section is not None:
            current_section = section
            if inline_text:
                collected[current_section].append(inline_text)
            continue
        if current_section:
            collected[current_section].append(line)

    return {
        section: "".join(f"{entry}\n" for entry in entries)
        for section, entries in collected.items()
    }
def parse_skills(section_text, ocr_used=False):
    """Turn the skills section into a list of skills.

    The text is split on commas, pipes and line breaks. If that yields
    more than one item, the items are returned. If it yields a single
    undelimited chunk, the chunk is searched for entries of
    ``skills.json`` (opened relative to the current working directory; a
    missing file counts as an empty list). When nothing is found there,
    the chunk itself is returned as the only skill.

    Args:
        section_text: Raw text of the skills section.
        ocr_used: Whether the text came from OCR; lowers the confidence
            of directly split results from 1.0 to 0.9, matching the
            other section parsers.

    Returns:
        ``(skills, confidence)``: a list of strings and a float, or
        ``(None, 0.0)`` when the section has no usable content. Results
        taken from the ``skills.json`` lookup carry confidence 0.8.
    """
    if not section_text.strip():
        return None, 0.0

    confidence = 0.9 if ocr_used else 1.0

    # Newlines are separators too; otherwise "A, B\nC, D" yields "B\nC".
    pieces = (part.strip() for part in re.split(r"[,|\r\n]", section_text))
    skills = [piece for piece in pieces if piece]
    if not skills:
        return None, 0.0
    if len(skills) > 1:
        return skills, confidence

    # Single undelimited chunk (e.g. a sentence): try the known-skills list.
    try:
        with open("skills.json", "r", encoding="utf-8") as f:
            known_skills = json.load(f)
    except FileNotFoundError:
        known_skills = []

    text_lower = section_text.lower()
    # Lookarounds instead of \b so skills ending in symbols ("C++") match.
    found_skills = [
        skill
        for skill in known_skills
        if re.search(rf"(?<!\w){re.escape(skill.lower())}(?!\w)", text_lower)
    ]
    if found_skills:
        return found_skills, 0.8
    return skills, confidence
def parse_experience(section_text, ocr_used=False):
    """Return the experience section as one newline-joined string.

    Blank lines are removed, and so is every line containing the whole
    word "project" or "skill" (case-insensitive; the plurals "projects"
    and "skills" are not matched by this filter).

    Args:
        section_text: Raw text of the experience section.
        ocr_used: Whether the text came from OCR.

    Returns:
        ``(value, confidence)``. ``(None, 0.0)`` when no line survives;
        otherwise the joined text with confidence 0.9 if ``ocr_used``
        else 1.0.
    """
    if section_text.strip() == "":
        return None, 0.0

    excluded = re.compile(r"\b(project|skill)\b", re.IGNORECASE)
    kept = [
        row
        for row in section_text.splitlines()
        if row.strip() and excluded.search(row) is None
    ]
    if len(kept) == 0:
        return None, 0.0

    joined = "\n".join(kept).strip()
    if ocr_used:
        return joined, 0.9
    return joined, 1.0
def parse_education(section_text, ocr_used=False):
    """Return the education section with blank lines removed.

    Args:
        section_text: Raw text of the education section.
        ocr_used: Whether the text came from OCR.

    Returns:
        ``(value, confidence)``: ``(None, 0.0)`` for an empty or
        whitespace-only section, otherwise the non-blank lines joined by
        newlines (outer whitespace stripped) with confidence 0.9 if
        ``ocr_used`` else 1.0.
    """
    if section_text.strip() == "":
        return None, 0.0
    # str.strip as the predicate keeps exactly the non-blank lines.
    non_blank = filter(str.strip, section_text.splitlines())
    joined = "\n".join(non_blank).strip()
    if ocr_used:
        return joined, 0.9
    return joined, 1.0
def parse_certifications(section_text, ocr_used=False):
    """Return the certifications section with blank lines removed.

    Args:
        section_text: Raw text of the certifications section.
        ocr_used: Whether the text came from OCR.

    Returns:
        ``(value, confidence)``: ``(None, 0.0)`` for an empty or
        whitespace-only section, otherwise the non-blank lines joined by
        newlines (outer whitespace stripped) with confidence 0.9 if
        ``ocr_used`` else 1.0.
    """
    if section_text.strip() == "":
        return None, 0.0

    kept_rows = []
    for row in section_text.splitlines():
        if row.strip():
            kept_rows.append(row)

    score = 0.9 if ocr_used else 1.0
    return "\n".join(kept_rows).strip(), score
def parse_projects(section_text, ocr_used=False):
    """Group the projects section into one line per project.

    A line containing a four-digit run, "present" or "github"
    (case-insensitive, anywhere in the line) starts a new project and
    becomes its title. Following lines are joined with spaces as that
    project's description. Lines that appear before the first title line
    are kept as an untitled entry instead of being dropped.

    Args:
        section_text: Raw text of the projects section.
        ocr_used: Whether the text came from OCR.

    Returns:
        ``(value, confidence)``. ``value`` has one project per line,
        formatted ``"title: description"`` (or just the title or just the
        description when the other part is missing). Confidence is 0.9 if
        ``ocr_used`` else 1.0. ``(None, 0.0)`` when the section has no
        content.
    """
    if not section_text.strip():
        return None, 0.0

    title_marker = re.compile(r"\d{4}|present|github", re.IGNORECASE)

    # Each entry is (title, [description lines]); title "" means untitled.
    entries = []
    for raw_line in section_text.splitlines():
        line = raw_line.strip()
        if not line:
            continue
        if title_marker.search(line):
            entries.append((line, []))
        elif entries:
            entries[-1][1].append(line)
        else:
            # Text before any title line: keep it rather than lose it.
            entries.append(("", [line]))

    result = []
    for title, desc_lines in entries:
        desc = " ".join(desc_lines)
        if title and desc:
            result.append(f"{title}: {desc}")
        else:
            result.append(title or desc)

    if not result:
        # Never report confidence for a missing value.
        return None, 0.0
    return "\n".join(result), 0.9 if ocr_used else 1.0
def parse_header_fields(text):
    """Extract name, email, phone and LinkedIn URL from resume text.

    The name is searched for in the lines above the first line that
    contains a section keyword, limited to the first eight lines. A line
    qualifies if it consists of capitalised words ("John Doe") or, as a
    fallback, of at least two all-caps words ("JOHN DOE", returned
    title-cased). Email, phone and LinkedIn are searched for in the whole
    text.

    Args:
        text: Full resume text.

    Returns:
        Dict with the keys ``name``, ``email``, ``phone`` and
        ``linkedin``; each value is ``{"value": str or None,
        "confidence": 0.99 or 0.0}``.
    """
    lines = [line.strip() for line in text.splitlines()]
    section_keywords = ["objective", "summary", "experience", "education",
                        "project", "skill", "certification", "interests"]

    # Index of the first line mentioning a section keyword (substring test).
    header_idx = len(lines)
    for i, line in enumerate(lines):
        if any(kw in line.lower() for kw in section_keywords):
            header_idx = i
            break

    name = ""
    for line in lines[:min(header_idx, 8)]:
        if not line:
            continue
        # Capitalised words, e.g. "John Doe".
        if re.match(r'^[A-Z][a-z]+(?:\s+[A-Z][a-z]*)*$', line):
            name = line
            break
        # Fallback for ALL-CAPS names of at least two words.
        if re.match(r'^[A-Z\s]{3,}$', line) and len(line.split()) >= 2:
            name = line.title()
            break

    # The TLD class is [A-Za-z]; a "|" inside the class would let the match
    # run across pipe-separated contact fields ("a@b.com|linkedin...").
    email_match = re.search(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b', text)
    phone_match = re.search(r'(?:\+?(\d{1,3}))?[-. (]*(\d{3})[-. )]*(\d{3})[-. ]*(\d{4})', text)
    linkedin_match = re.search(r'(https?://)?(www\.)?linkedin\.com/in/[\w-]+', text)

    def field(match):
        # Shared shape for the regex-based fields.
        if match:
            return {"value": match.group(0), "confidence": 0.99}
        return {"value": None, "confidence": 0.0}

    return {
        "name": {"value": name or None, "confidence": 0.99 if name else 0.0},
        "email": field(email_match),
        "phone": field(phone_match),
        "linkedin": field(linkedin_match),
    }
def parse_resume(file_path):
    """Parse a PDF resume into a dict of fields with confidences.

    The text is extracted with ``extract_text_from_pdf``, split with
    ``split_sections``, and each section is handed to its parser together
    with the OCR flag.

    Args:
        file_path: Path of the PDF, forwarded to ``extract_text_from_pdf``.

    Returns:
        Dict containing the entries of ``parse_header_fields`` followed by
        ``skills``, ``experience``, ``education``, ``projects`` and
        ``certifications``, each as ``{"value": ..., "confidence": ...}``.
    """
    text, ocr_used = extract_text_from_pdf(file_path)
    sections = split_sections(text)

    # Output key -> parser, in the order the keys appear in the result.
    section_parsers = (
        ("skills", parse_skills),
        ("experience", parse_experience),
        ("education", parse_education),
        ("projects", parse_projects),
        ("certifications", parse_certifications),
    )

    result = dict(parse_header_fields(text))
    for key, parser in section_parsers:
        value, confidence = parser(sections.get(key, ''), ocr_used)
        result[key] = {"value": value, "confidence": confidence}
    return result