Spaces:

Akash076
/

Resume-parser

No application file

App Files Files Community

Akash076 committed on May 2, 2025

Commit

3e62707

verified ·

1 Parent(s): 08dd5c4

Upload pdf_parser.py

Browse files

Files changed (1) hide show

Src/pdf_parser.py +205 -0

Src/pdf_parser.py ADDED Viewed

	@@ -0,0 +1,205 @@

+import fitz
+from PIL import Image
+import pytesseract
+import re
+import io
+import json
def extract_text_from_pdf(file_path):
    """Extract the text of a PDF, using OCR for pages without a text layer.

    Args:
        file_path: Path of the PDF; passed straight to ``fitz.open``.

    Returns:
        A ``(text, ocr_used)`` tuple. ``text`` is the concatenation of every
        page's text, each followed by a newline. ``ocr_used`` is True when at
        least one page had no extractable text and was run through Tesseract.
    """
    page_texts = []
    ocr_used = False
    # The context manager closes the document even when rendering or OCR raises.
    with fitz.open(file_path) as doc:
        for page in doc:
            page_text = page.get_text().strip()
            if not page_text:
                # No embedded text: rasterise the page at 300 dpi and OCR it.
                ocr_used = True
                pix = page.get_pixmap(dpi=300)
                img = Image.open(io.BytesIO(pix.tobytes()))
                page_text = pytesseract.image_to_string(img)
            page_texts.append(page_text + "\n")
    return "".join(page_texts), ocr_used
def split_sections(text):
    """Split resume text into its well-known sections.

    A line is treated as a section header only when it is short (at most four
    words) and contains one of the known header phrases as whole words.
    Longer lines that merely mention a keyword ("gained experience with ...")
    are kept as content instead of being swallowed as headers. A header may
    also be followed by inline content after a colon ("Skills: Python, SQL");
    that content is kept as the first line of the section.

    Args:
        text: Full resume text.

    Returns:
        A dict with the keys ``experience``, ``education``, ``skills``,
        ``certifications`` and ``projects``. Each value is the section's
        non-blank, stripped lines, every one terminated by a newline (an
        empty string when the section was not found). Lines that appear
        before the first recognised header are discarded.
    """
    section_headers = {
        'experience': ['experience', 'work experience', 'professional experience'],
        'education': ['education', 'academic qualifications', 'qualifications'],
        'skills': ['skills', 'technical skills', 'key skills', 'core competencies'],
        'certifications': ['certifications', 'certification', 'achievements'],
        'projects': ['projects', 'project experience', 'personal projects', 'project']
    }
    # One whole-word pattern per section; dict order decides ties, so a line
    # such as "Project Experience" resolves to 'experience'.
    header_patterns = {
        sec: re.compile(r"\b(?:" + "|".join(re.escape(h) for h in headers) + r")\b")
        for sec, headers in section_headers.items()
    }
    max_header_words = 4

    collected = {sec: [] for sec in section_headers}
    current_section = None
    for raw_line in (text or "").splitlines():
        line = raw_line.strip()
        if not line:
            continue

        # "Header: inline content" -> judge only the part before the colon.
        head, colon, remainder = line.partition(":")
        candidate = head if colon else line
        header_section = None
        if len(candidate.split()) <= max_header_words:
            lowered = candidate.lower()
            header_section = next(
                (sec for sec, pattern in header_patterns.items()
                 if pattern.search(lowered)),
                None,
            )

        if header_section is not None:
            current_section = header_section
            inline_content = remainder.strip()
            if inline_content:
                collected[current_section].append(inline_content)
            continue

        if current_section is not None:
            collected[current_section].append(line)

    return {
        sec: "".join(entry + "\n" for entry in entries)
        for sec, entries in collected.items()
    }
def parse_skills(section_text, ocr_used=False):
    """Turn the skills section into a list of individual skills.

    Commas, pipes and line breaks are all treated as separators, so mixed
    layouts ("Python, Java" on one line, "C++ | Go" on the next) produce one
    flat list without newlines embedded in any item.

    Args:
        section_text: Raw text of the skills section.
        ocr_used: Whether the source text came from OCR; lowers confidence.

    Returns:
        ``(skills, confidence)``. ``skills`` is a list of non-empty, stripped
        strings, or None when the section is empty or holds only separators.
        ``confidence`` is 0.0 for None, 0.9 for OCR'd text and 1.0 otherwise,
        matching the other section parsers.
    """
    if not section_text or not section_text.strip():
        return None, 0.0

    tokens = (token.strip() for token in re.split(r"[,|\r\n]+", section_text))
    skills = [token for token in tokens if token]
    # Any section that is not pure separators yields at least one token, so
    # no keyword-lookup fallback is needed beyond this point.
    if not skills:
        return None, 0.0
    return skills, 0.9 if ocr_used else 1.0
def parse_experience(section_text, ocr_used=False):
    """Clean up the experience section.

    Only stray heading lines ("Project", "Projects", "Skill", "Skills", with
    an optional trailing colon) are removed. Content lines that merely mention
    a project or a skill are kept.

    Args:
        section_text: Raw text of the experience section.
        ocr_used: Whether the source text came from OCR; lowers confidence.

    Returns:
        ``(value, confidence)``. ``value`` is the remaining stripped lines
        joined by newlines, or None when nothing is left. ``confidence`` is
        0.0 for None, 0.9 for OCR'd text and 1.0 otherwise.
    """
    if not section_text or not section_text.strip():
        return None, 0.0

    stray_heading = re.compile(r"(?:projects?|skills?)\s*:?", re.IGNORECASE)
    exp_lines = []
    for raw_line in section_text.splitlines():
        line = raw_line.strip()
        if not line or stray_heading.fullmatch(line):
            continue
        exp_lines.append(line)

    if not exp_lines:
        return None, 0.0
    return "\n".join(exp_lines), 0.9 if ocr_used else 1.0
def parse_education(section_text, ocr_used=False):
    """Normalise the education section into clean, newline-separated lines.

    Args:
        section_text: Raw text of the education section; None is treated as
            an empty section.
        ocr_used: Whether the source text came from OCR; lowers confidence.

    Returns:
        ``(value, confidence)``. ``value`` is the non-blank lines, each
        stripped of surrounding whitespace and joined by newlines, or None
        for an empty section. ``confidence`` is 0.0 for None, 0.9 for OCR'd
        text and 1.0 otherwise.
    """
    if not section_text or not section_text.strip():
        return None, 0.0
    # Strip every line, not just the ends of the joined value, so stray
    # indentation or trailing blanks do not leak into the result.
    lines = [line.strip() for line in section_text.splitlines() if line.strip()]
    return "\n".join(lines), 0.9 if ocr_used else 1.0
def parse_certifications(section_text, ocr_used=False):
    """Normalise the certifications section into clean, newline-separated lines.

    Args:
        section_text: Raw text of the certifications section; None is treated
            as an empty section.
        ocr_used: Whether the source text came from OCR; lowers confidence.

    Returns:
        ``(value, confidence)``. ``value`` is the non-blank lines, each
        stripped of surrounding whitespace and joined by newlines, or None
        for an empty section. ``confidence`` is 0.0 for None, 0.9 for OCR'd
        text and 1.0 otherwise.
    """
    if not section_text or not section_text.strip():
        return None, 0.0
    # Strip every line, not just the ends of the joined value, so stray
    # indentation or trailing blanks do not leak into the result.
    lines = [line.strip() for line in section_text.splitlines() if line.strip()]
    return "\n".join(lines), 0.9 if ocr_used else 1.0
def parse_projects(section_text, ocr_used=False):
    """Group the projects section into ``title: description`` entries.

    A line starts a new project when it looks like a title: it contains a
    four-digit year (19xx/20xx), the word "present", or "github". The first
    non-blank line always starts a project, so sections without any dated
    title are still returned rather than dropped. All other lines are
    appended to the current project's description.

    Args:
        section_text: Raw text of the projects section.
        ocr_used: Whether the source text came from OCR; lowers confidence.

    Returns:
        ``(value, confidence)``. ``value`` holds one entry per line, formatted
        ``"<title>: <description>"`` (or just the title when there is no
        description), or None for an empty section. ``confidence`` is 0.0 for
        None, 0.9 for OCR'd text and 1.0 otherwise.
    """
    if not section_text or not section_text.strip():
        return None, 0.0

    # Word/digit boundaries keep "presentation" or "10000 users" from being
    # mistaken for a dated title line.
    title_pattern = re.compile(
        r"(?<!\d)(?:19|20)\d{2}(?!\d)|\bpresent\b|github", re.IGNORECASE
    )

    projects = []  # (title, [description lines]) pairs in document order
    for raw_line in section_text.splitlines():
        line = raw_line.strip()
        if not line:
            continue
        if not projects or title_pattern.search(line):
            projects.append((line, []))
        else:
            projects[-1][1].append(line)

    if not projects:
        return None, 0.0

    entries = []
    for title, description_lines in projects:
        description = " ".join(description_lines)
        entries.append(f"{title}: {description}" if description else title)
    return "\n".join(entries), 0.9 if ocr_used else 1.0
def parse_header_fields(text):
    """Extract name, email, phone and LinkedIn URL from resume text.

    The name is searched for only in the lines above the first line that
    mentions a section keyword (and never beyond the first eight lines). It
    is the first line that is either a sequence of capitalised words or an
    all-caps line of at least two words (returned title-cased). Email, phone
    and LinkedIn are the first regex match anywhere in ``text``.

    Args:
        text: Full resume text.

    Returns:
        A dict with the keys ``name``, ``email``, ``phone`` and ``linkedin``,
        each mapping to ``{"value": str | None, "confidence": float}`` where
        confidence is 0.99 when a value was found and 0.0 otherwise.
    """
    text = text or ""
    lines = [line.strip() for line in text.splitlines()]
    section_keywords = ["objective", "summary", "experience", "education",
                        "project", "skill", "certification", "interests"]

    # Index of the first line mentioning a section keyword; the name must
    # appear above it.
    header_idx = len(lines)
    for i, line in enumerate(lines):
        lowered = line.lower()
        if any(kw in lowered for kw in section_keywords):
            header_idx = i
            break

    name = ""
    for line in lines[:min(header_idx, 8)]:
        if not line:
            continue
        # Capitalised words, e.g. "Jane Doe" or "Jane D".
        if re.match(r'^[A-Z][a-z]+(?:\s+[A-Z][a-z]*)*$', line):
            name = line
            break
        # ALL-CAPS names need two or more words so lone headings are skipped.
        if re.match(r'^[A-Z\s]{3,}$', line) and len(line.split()) >= 2:
            name = line.title()
            break

    # The TLD class is letters only: a "|" inside the brackets would be a
    # literal pipe and let the match run into "|"-separated contact details.
    email_match = re.search(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b', text)
    phone_match = re.search(r'(?:\+?(\d{1,3}))?[-. (]*(\d{3})[-. )]*(\d{3})[-. ]*(\d{4})', text)
    linkedin_match = re.search(r'(https?://)?(www\.)?linkedin\.com/in/[\w-]+', text)

    def field(match):
        """Wrap an optional regex match in the value/confidence structure."""
        if match:
            return {"value": match.group(0), "confidence": 0.99}
        return {"value": None, "confidence": 0.0}

    return {
        "name": {"value": name or None, "confidence": 0.99 if name else 0.0},
        "email": field(email_match),
        "phone": field(phone_match),
        "linkedin": field(linkedin_match),
    }
def parse_resume(file_path):
    """Parse a PDF resume into a dict of fields with confidence scores.

    Extracts the text (with OCR where needed), splits it into sections, and
    runs the header parser and each section parser over the result.

    Args:
        file_path: Path of the PDF resume.

    Returns:
        A dict holding the header fields returned by ``parse_header_fields``
        followed by ``skills``, ``experience``, ``education``, ``projects``
        and ``certifications``, each as
        ``{"value": ..., "confidence": ...}``.
    """
    text, ocr_used = extract_text_from_pdf(file_path)
    sections = split_sections(text)
    header_data = parse_header_fields(text)

    # Section parsers, listed in the order they are invoked.
    section_parsers = {
        "experience": parse_experience,
        "education": parse_education,
        "skills": parse_skills,
        "projects": parse_projects,
        "certifications": parse_certifications,
    }
    parsed = {
        section: parser(sections.get(section, ''), ocr_used)
        for section, parser in section_parsers.items()
    }

    # Assemble the output: header fields first, then sections in report order.
    result = dict(header_data)
    for section in ("skills", "experience", "education", "projects", "certifications"):
        value, confidence = parsed[section]
        result[section] = {"value": value, "confidence": confidence}
    return result