Spaces:
No application file
No application file
File size: 7,905 Bytes
3e62707 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 |
import fitz
from PIL import Image
import pytesseract
import re
import io
import json
def extract_text_from_pdf(file_path):
    """Extract all text from a PDF, falling back to OCR for image-only pages.

    Pages with an embedded text layer are read directly; pages without one
    are rasterized at 300 DPI and run through Tesseract.

    Returns a tuple (text, ocr_used) where ocr_used is True if any page
    required OCR.
    """
    doc = fitz.open(file_path)
    parts = []
    ocr_used = False
    for page in doc:
        extracted = page.get_text().strip()
        if not extracted:
            # No text layer on this page: render it and OCR the image.
            ocr_used = True
            pix = page.get_pixmap(dpi=300)
            image = Image.open(io.BytesIO(pix.tobytes()))
            extracted = pytesseract.image_to_string(image)
        parts.append(extracted + "\n")
    return "".join(parts), ocr_used
def split_sections(text):
    """Split raw resume text into named sections.

    A line is treated as a section header when it is short and matches one of
    the known header phrases; all following lines are accumulated under that
    section until the next header.

    Fix: the original treated ANY line containing a keyword as a header (so a
    body line like "Led a project to ..." was dropped and flipped the current
    section). Header detection is now restricted to short lines, which is what
    real section headers look like.

    Returns a dict mapping section name -> accumulated text ("" if absent).
    """
    section_headers = {
        'experience': ['experience', 'work experience', 'professional experience'],
        'education': ['education', 'academic qualifications', 'qualifications'],
        'skills': ['skills', 'technical skills', 'key skills', 'core competencies'],
        'certifications': ['certifications', 'certification', 'achievements'],
        'projects': ['projects', 'project experience', 'personal projects', 'project']
    }
    sections = {key: "" for key in section_headers}

    # Real headers are short ("Work Experience", "Skills:"); long lines are
    # body content even if they happen to contain a keyword.
    _MAX_HEADER_LEN = 40

    def _match_header(lower_line):
        """Return the section name if lower_line looks like a header, else None."""
        if len(lower_line) > _MAX_HEADER_LEN:
            return None
        for sec, headers in section_headers.items():
            for header in headers:
                if (lower_line.startswith(header) or
                        lower_line.endswith(header) or
                        header in lower_line):
                    return sec
        return None

    current_section = None
    for raw in text.splitlines():
        line = raw.strip()
        if not line:
            continue
        matched = _match_header(line.lower())
        if matched:
            current_section = matched
            continue
        if current_section:
            sections[current_section] += line + "\n"
    return sections
def parse_skills(section_text, ocr_used=False):
    """Parse a skills section into a list of skill strings.

    Strategy: try comma/pipe-delimited format first, then one-skill-per-line,
    then fall back to matching against a known-skills dictionary (skills.json).

    Fix: `ocr_used` was accepted but ignored — every other parser in this file
    lowers confidence to 0.9 for OCR-derived text, so this one now does too.

    Returns (skills_or_None, confidence).
    """
    if not section_text.strip():
        return None, 0.0
    # Consistent with parse_experience/parse_education/etc.
    base_conf = 0.9 if ocr_used else 1.0
    # Comma/pipe separated format, e.g. "Python, SQL | Docker".
    if re.search(r"[,|]", section_text):
        cleaned = [s.strip() for s in re.split(r"\s*[,|]\s*", section_text) if s.strip()]
        if cleaned:
            return cleaned, base_conf
    # One skill per line.
    lines = [line.strip() for line in section_text.splitlines() if line.strip()]
    if lines:
        return lines, base_conf
    # Fallback: dictionary lookup against a known-skills list.
    try:
        with open("skills.json", "r") as f:
            skills_list = json.load(f)
    except FileNotFoundError:
        skills_list = []
    text_lower = section_text.lower()
    found_skills = [skill for skill in skills_list
                    if re.search(rf"\b{re.escape(skill.lower())}\b", text_lower)]
    # Dictionary matches are less certain than an explicit list: cap at 0.8.
    return found_skills or None, 0.8 if found_skills else 0.0
def parse_experience(section_text, ocr_used=False):
    """Parse an experience section into a single text blob.

    Drops lines mentioning "project" or "skill", which indicate content that
    bled in from neighboring sections during splitting.

    Returns (experience_text_or_None, confidence); confidence is 0.9 when the
    text came from OCR, 1.0 otherwise.
    """
    if not section_text.strip():
        return None, 0.0
    bleed = re.compile(r"\b(project|skill)\b", re.IGNORECASE)
    kept = [ln for ln in section_text.splitlines()
            if ln.strip() and not bleed.search(ln)]
    if not kept:
        return None, 0.0
    return "\n".join(kept).strip(), 0.9 if ocr_used else 1.0
def parse_education(section_text, ocr_used=False):
    """Parse an education section: join its non-blank lines into one blob.

    Returns (education_text_or_None, confidence); 0.9 confidence under OCR.
    """
    if not section_text.strip():
        return None, 0.0
    content = [ln for ln in section_text.splitlines() if ln.strip()]
    return "\n".join(content).strip(), 0.9 if ocr_used else 1.0
def parse_certifications(section_text, ocr_used=False):
    """Parse a certifications section: join its non-blank lines into one blob.

    Returns (certifications_text_or_None, confidence); 0.9 under OCR.
    """
    if not section_text.strip():
        return None, 0.0
    nonblank = filter(str.strip, section_text.splitlines())
    return "\n".join(nonblank).strip(), 0.9 if ocr_used else 1.0
def parse_projects(section_text, ocr_used=False):
    """Group a projects section into "title: description" entries.

    A line containing a 4-digit year, "present", or "github" starts a new
    project (it becomes the title); subsequent lines are folded into that
    project's description. Description text seen before the first title line
    is discarded, matching the section's original behavior.

    Returns (joined_entries_or_None, confidence); 0.9 confidence under OCR.
    """
    if not section_text.strip():
        return None, 0.0
    title_pat = re.compile(r'(.*\d{4}.*|.*present.*|.*github.*)', re.IGNORECASE)
    entries = []
    title, desc = "", ""
    for line in section_text.splitlines():
        if not line.strip():
            continue
        if title_pat.match(line):
            if title:
                entries.append((title, desc))
            title, desc = line.strip(), ""
        else:
            desc += line + " "
    if title:
        entries.append((title, desc))
    formatted = [f"{t}: {d.strip()}" if d.strip() else t for t, d in entries]
    joined = "\n".join(formatted) if formatted else None
    return joined, 0.9 if ocr_used else 1.0
def parse_header_fields(text):
    """Extract name, email, phone, and LinkedIn URL from resume header text.

    The name is only searched for above the first recognized section header
    (and within the first 8 lines), accepting either Title Case or ALL-CAPS
    multi-word lines. Email/phone/LinkedIn are searched in the full text.

    Returns {field: {"value": str_or_None, "confidence": float}}.
    """
    stripped = [ln.strip() for ln in text.splitlines()]
    keywords = ("objective", "summary", "experience", "education",
                "project", "skill", "certification", "interests")

    # Index of the first line that looks like a section header; the name must
    # appear before it.
    first_section = len(stripped)
    for idx, ln in enumerate(stripped):
        if any(kw in ln.lower() for kw in keywords):
            first_section = idx
            break

    name = ""
    for ln in stripped[:min(first_section, 8)]:
        if not ln:
            continue
        # Title Case name, e.g. "John Smith".
        if re.match(r'^[A-Z][a-z]+(?:\s+[A-Z][a-z]*)*$', ln):
            name = ln
            break
        # ALL-CAPS fallback, e.g. "JOHN SMITH" -> "John Smith".
        if re.match(r'^[A-Z\s]{3,}$', ln) and len(ln.split()) >= 2:
            name = ln.title()
            break

    email = re.search(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b', text)
    phone = re.search(r'(?:\+?(\d{1,3}))?[-. (]*(\d{3})[-. )]*(\d{3})[-. ]*(\d{4})', text)
    linkedin = re.search(r'(https?://)?(www\.)?linkedin\.com/in/[\w-]+', text)

    def _field(value):
        # Presence-based confidence: 0.99 when found, 0.0 otherwise.
        return {"value": value or None, "confidence": 0.99 if value else 0.0}

    return {
        "name": _field(name),
        "email": _field(email.group(0) if email else None),
        "phone": _field(phone.group(0) if phone else None),
        "linkedin": _field(linkedin.group(0) if linkedin else None),
    }
def parse_resume(file_path):
    """End-to-end resume parsing for a PDF file.

    Extracts text (using OCR where needed), splits it into sections, and
    parses each field. Returns a dict mapping field name to
    {"value": ..., "confidence": ...}, with header fields (name, email,
    phone, linkedin) first, followed by the section fields.
    """
    text, ocr_used = extract_text_from_pdf(file_path)
    sections = split_sections(text)

    result = dict(parse_header_fields(text))
    # Section name -> parser; insertion order fixes the output key order.
    parsers = {
        "skills": parse_skills,
        "experience": parse_experience,
        "education": parse_education,
        "projects": parse_projects,
        "certifications": parse_certifications,
    }
    for key, parser in parsers.items():
        value, confidence = parser(sections.get(key, ''), ocr_used)
        result[key] = {"value": value, "confidence": confidence}
    return result