Spaces:

Akash076
/

Resume-parser

No application file

App Files Files Community

Resume-parser / Src /pdf_parser.py

Akash076

Upload pdf_parser.py

3e62707 verified 9 months ago

raw

history blame contribute delete

7.91 kB

	import fitz
	from PIL import Image
	import pytesseract
	import re
	import io
	import json

	def extract_text_from_pdf(file_path):
	    """Extract the text of every page of a PDF, using OCR for pages without text.

	    Each page is first read through its embedded text layer. When that layer
	    is empty (after stripping whitespace) the page is rendered to an image at
	    300 dpi and passed to Tesseract instead.

	    Args:
	        file_path: Path of the PDF, passed straight to ``fitz.open``.

	    Returns:
	        A ``(text, ocr_used)`` tuple. ``text`` is the per-page text, each page
	        followed by a newline; ``ocr_used`` is True when at least one page
	        had to be OCR'd.
	    """
	    page_chunks = []
	    ocr_used = False
	    # The context manager closes the document even if a page fails to render
	    # or OCR raises part-way through.
	    with fitz.open(file_path) as doc:
	        for page in doc:
	            page_text = page.get_text().strip()
	            if not page_text:
	                ocr_used = True
	                pix = page.get_pixmap(dpi=300)
	                # Hand the rendered page to PIL via an in-memory buffer and
	                # release the image as soon as OCR is done.
	                with Image.open(io.BytesIO(pix.tobytes())) as img:
	                    page_text = pytesseract.image_to_string(img)
	            page_chunks.append(page_text + "\n")
	    return "".join(page_chunks), ocr_used

	def split_sections(text):
	    """Split resume text into the bodies of its recognised sections.

	    Every non-blank line is stripped and lower-cased, then checked for any of
	    the known heading phrases. A line that contains a phrase is treated as a
	    heading: it switches the current section and is not itself stored. Any
	    other line is appended to the current section; lines seen before the
	    first heading are dropped.

	    When a line contains phrases from more than one section, the longest
	    phrase wins, so "Project Experience" selects ``projects`` rather than
	    the generic ``experience``.

	    NOTE(review): matching is by substring, so a content line that merely
	    mentions e.g. "experience" is also taken as a heading.

	    Args:
	        text: Full resume text.

	    Returns:
	        Dict with the keys ``experience``, ``education``, ``skills``,
	        ``certifications`` and ``projects``. Each value is the section's
	        lines, every one terminated by a newline, or ``""`` when the
	        section was not found.
	    """
	    section_headers = {
	        'experience': ['experience', 'work experience', 'professional experience'],
	        'education': ['education', 'academic qualifications', 'qualifications'],
	        'skills': ['skills', 'technical skills', 'key skills', 'core competencies'],
	        'certifications': ['certifications', 'certification', 'achievements'],
	        'projects': ['projects', 'project experience', 'personal projects', 'project'],
	    }

	    # Flatten to (phrase, section) pairs, longest phrase first. sorted() is
	    # stable (also with reverse=True), so phrases of equal length keep their
	    # declaration order and the result stays deterministic.
	    phrases = sorted(
	        (
	            (header.lower(), sec)
	            for sec, headers in section_headers.items()
	            for header in headers
	        ),
	        key=lambda pair: len(pair[0]),
	        reverse=True,
	    )

	    collected = {key: [] for key in section_headers}
	    current_section = None

	    for raw_line in text.splitlines():
	        line = raw_line.strip()
	        if not line:
	            continue
	        lower_line = line.lower()
	        matched = next(
	            (sec for phrase, sec in phrases if phrase in lower_line),
	            None,
	        )
	        if matched is not None:
	            # Heading line: switch section, do not keep the heading itself.
	            current_section = matched
	            continue
	        if current_section is not None:
	            collected[current_section].append(line + "\n")

	    return {key: "".join(parts) for key, parts in collected.items()}

	def parse_skills(section_text, ocr_used=False):
	    """Turn the text of a skills section into a list of individual skills.

	    The text is split on commas, pipes and line breaks, so inline lists
	    ("Python, Java | SQL"), one-skill-per-line lists and mixtures of both
	    are all handled. Surrounding whitespace is removed from every item and
	    empty items are discarded.

	    Args:
	        section_text: Raw text of the skills section.
	        ocr_used: Whether the source text came from OCR; lowers the reported
	            confidence in the same way as the other section parsers.

	    Returns:
	        A ``(skills, confidence)`` tuple. ``skills`` is a list of strings, or
	        None when the section is blank or contains only delimiters (in which
	        case the confidence is 0.0). Otherwise the confidence is 0.9 for
	        OCR'd text and 1.0 for native text.
	    """
	    if not section_text.strip():
	        return None, 0.0

	    # Whitespace around a delimiter is optional, so "a, b" and "a | b" both
	    # split; str.strip() below removes whatever padding is left.
	    pieces = re.split(r"[,|\r\n]+", section_text)
	    skills = [piece.strip() for piece in pieces if piece.strip()]
	    if not skills:
	        return None, 0.0

	    return skills, 0.9 if ocr_used else 1.0

	def parse_experience(section_text, ocr_used=False):
	    """Clean the text of an experience section.

	    Blank lines are removed, and so is every line that contains the whole
	    word "project" or "skill" (case-insensitive; the plurals "projects" and
	    "skills" do not count as whole-word matches and are kept).

	    Args:
	        section_text: Raw text of the experience section.
	        ocr_used: Whether the source text came from OCR.

	    Returns:
	        A ``(value, confidence)`` tuple. ``value`` is the remaining lines
	        joined with newlines and stripped, or None (confidence 0.0) when the
	        section is blank or every line was filtered out. Otherwise the
	        confidence is 0.9 for OCR'd text and 1.0 for native text.
	    """
	    if not section_text.strip():
	        return None, 0.0

	    # A real alternation: an escaped pipe would only match the literal
	    # text "project|skill" and the filter would never fire.
	    unrelated = re.compile(r"\b(?:project|skill)\b", re.IGNORECASE)
	    exp_lines = [
	        line
	        for line in section_text.splitlines()
	        if line.strip() and not unrelated.search(line)
	    ]
	    if not exp_lines:
	        return None, 0.0

	    return "\n".join(exp_lines).strip(), 0.9 if ocr_used else 1.0

	def parse_education(section_text, ocr_used=False):
	    """Return the education section with blank lines removed.

	    Args:
	        section_text: Raw text of the education section.
	        ocr_used: Whether the source text came from OCR.

	    Returns:
	        A ``(value, confidence)`` tuple: ``(None, 0.0)`` for a blank section,
	        otherwise the non-blank lines joined by newlines (outer whitespace
	        stripped) with confidence 0.9 for OCR'd text and 1.0 otherwise.
	    """
	    if section_text.strip() == "":
	        return None, 0.0

	    # filter(str.strip, ...) keeps exactly the lines that are not
	    # whitespace-only; the lines themselves are left untouched.
	    non_blank = filter(str.strip, section_text.splitlines())
	    cleaned = "\n".join(non_blank).strip()

	    if ocr_used:
	        return cleaned, 0.9
	    return cleaned, 1.0

	def parse_certifications(section_text, ocr_used=False):
	    """Return the certifications section with blank lines removed.

	    Args:
	        section_text: Raw text of the certifications section.
	        ocr_used: Whether the source text came from OCR.

	    Returns:
	        A ``(value, confidence)`` tuple: ``(None, 0.0)`` for a blank section,
	        otherwise the non-blank lines joined by newlines (outer whitespace
	        stripped) with confidence 0.9 for OCR'd text and 1.0 otherwise.
	    """
	    if not section_text.strip():
	        return None, 0.0

	    kept = []
	    for raw_line in section_text.splitlines():
	        # Skip whitespace-only lines; keep the rest exactly as written.
	        if raw_line.strip():
	            kept.append(raw_line)

	    joined = "\n".join(kept)
	    score = 0.9 if ocr_used else 1.0
	    return joined.strip(), score

	def parse_projects(section_text, ocr_used=False):
	    """Group the lines of a projects section into "title: description" entries.

	    A line counts as a project title when it contains a run of four digits
	    (e.g. a year), the text "present" or the text "github", compared
	    case-insensitively. The lines that follow a title, up to the next title,
	    are joined with spaces to form that project's description. Lines that
	    appear before the first title are kept as a separate entry without a
	    title, so no text is discarded.

	    Args:
	        section_text: Raw text of the projects section.
	        ocr_used: Whether the source text came from OCR.

	    Returns:
	        A ``(value, confidence)`` tuple. ``value`` holds one entry per line:
	        ``"title: description"``, or just the title or just the description
	        when the other part is missing. A blank section yields
	        ``(None, 0.0)``; otherwise the confidence is 0.9 for OCR'd text and
	        1.0 for native text.
	    """
	    if not section_text.strip():
	        return None, 0.0

	    # search() rather than match(): the marker may sit anywhere in the line.
	    title_marker = re.compile(r"\d{4}|present|github", re.IGNORECASE)

	    projects = []  # (title, [description lines]) in document order
	    for line in section_text.splitlines():
	        if not line.strip():
	            continue
	        if title_marker.search(line):
	            projects.append((line.strip(), []))
	        elif projects:
	            projects[-1][1].append(line)
	        else:
	            # Text ahead of the first title: keep it as an untitled entry.
	            projects.append(("", [line]))

	    entries = []
	    for title, description_lines in projects:
	        description = " ".join(description_lines).strip()
	        if title and description:
	            entries.append(f"{title}: {description}")
	        else:
	            entries.append(title or description)

	    if not entries:
	        return None, 0.0
	    return "\n".join(entries), 0.9 if ocr_used else 1.0

	def parse_header_fields(text):
	    """Extract name, e-mail, phone number and LinkedIn URL from resume text.

	    The name is looked for in the lines above the first line that mentions a
	    section keyword, and never beyond the first eight lines. A candidate is
	    either a mixed-case name of two or more capitalised words (single
	    initials such as "A." are allowed after the first word) or an all-caps
	    line of at least two words, which is converted to title case. E-mail,
	    phone and LinkedIn URL are searched for in the whole text; the first
	    match of each is used.

	    Args:
	        text: Full resume text.

	    Returns:
	        Dict with the keys ``name``, ``email``, ``phone`` and ``linkedin``.
	        Each value is ``{"value": str | None, "confidence": float}`` where
	        the confidence is 0.99 when a value was found and 0.0 otherwise.
	    """
	    lines = [line.strip() for line in text.splitlines()]
	    section_keywords = ("objective", "summary", "experience", "education",
	                        "project", "skill", "certification", "interests")

	    # Index of the first line that mentions a section keyword; everything
	    # above it is considered the header area.
	    header_idx = next(
	        (
	            i for i, line in enumerate(lines)
	            if any(kw in line.lower() for kw in section_keywords)
	        ),
	        len(lines),
	    )

	    # "John Smith", "John A. Smith": the first word is capitalised and every
	    # following word is either a capitalised word or a single initial.
	    mixed_case_name = re.compile(r"^[A-Z][a-z]+(?:\s+(?:[A-Z]\.?|[A-Z][a-z]+))+$")
	    upper_case_name = re.compile(r"^[A-Z\s]{3,}$")

	    name = ""
	    for line in lines[:min(header_idx, 8)]:
	        if not line:
	            continue
	        if mixed_case_name.match(line):
	            name = line
	            break
	        if upper_case_name.match(line) and len(line.split()) >= 2:
	            name = line.title()
	            break

	    email_match = re.search(
	        r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b", text
	    )
	    # Optional country code, then a 3-3-4 digit number whose separators are
	    # all optional. The match starts at "+", "(" or a digit, so the value
	    # carries no leading separator; the look-arounds stop it from matching
	    # inside a longer word or digit run.
	    phone_match = re.search(
	        r"(?<!\w)(?:\+?\d{1,3}[-. ]*)?\(?\d{3}\)?[-. ]*\d{3}[-. ]*\d{4}(?!\d)",
	        text,
	    )
	    linkedin_match = re.search(
	        r"(https?://)?(www\.)?linkedin\.com/in/[\w-]+", text
	    )

	    def as_field(value):
	        # Shared {"value", "confidence"} wrapper used for every header field.
	        return {"value": value or None, "confidence": 0.99 if value else 0.0}

	    return {
	        "name": as_field(name),
	        "email": as_field(email_match.group(0) if email_match else None),
	        "phone": as_field(phone_match.group(0) if phone_match else None),
	        "linkedin": as_field(linkedin_match.group(0) if linkedin_match else None),
	    }

	def parse_resume(file_path):
	    """Parse a PDF resume into a dict of fields with confidence scores.

	    The PDF text is extracted, split into sections, and each section is
	    handed to its parser together with the flag saying whether OCR was
	    needed.

	    Args:
	        file_path: Path of the PDF resume.

	    Returns:
	        Dict holding the entries produced by ``parse_header_fields`` followed
	        by ``skills``, ``experience``, ``education``, ``projects`` and
	        ``certifications``; each of those five maps to
	        ``{"value": ..., "confidence": ...}``.
	    """
	    raw_text, used_ocr = extract_text_from_pdf(file_path)
	    by_section = split_sections(raw_text)

	    # Start from the header fields, then add one entry per section in the
	    # order the keys should appear in the result.
	    parsed = dict(parse_header_fields(raw_text))

	    section_parsers = (
	        ("skills", parse_skills),
	        ("experience", parse_experience),
	        ("education", parse_education),
	        ("projects", parse_projects),
	        ("certifications", parse_certifications),
	    )
	    for key, parser in section_parsers:
	        value, confidence = parser(by_section.get(key, ''), used_ocr)
	        parsed[key] = {"value": value, "confidence": confidence}

	    return parsed