Spaces:
No application file
No application file
| import fitz | |
| from PIL import Image | |
| import pytesseract | |
| import re | |
| import io | |
| import json | |
def extract_text_from_pdf(file_path):
    """Extract text from a PDF, falling back to OCR for image-only pages.

    Parameters
    ----------
    file_path : str
        Path to the PDF file.

    Returns
    -------
    tuple[str, bool]
        The extracted text (one trailing newline per page) and a flag
        telling whether OCR was needed for at least one page.
    """
    parts = []
    ocr_used = False
    doc = fitz.open(file_path)
    try:
        for page in doc:
            page_text = page.get_text().strip()
            if page_text:
                parts.append(page_text)
            else:
                # Page has no embedded text layer: rasterize it and OCR.
                ocr_used = True
                pix = page.get_pixmap(dpi=300)
                # Explicit "png" keeps the byte stream PIL-decodable across
                # PyMuPDF versions (the bare tobytes() default has varied).
                img = Image.open(io.BytesIO(pix.tobytes("png")))
                parts.append(pytesseract.image_to_string(img))
    finally:
        doc.close()  # fix: original never closed the document handle
    return "".join(p + "\n" for p in parts), ocr_used
def split_sections(text):
    """Split resume text into known sections keyed by canonical names.

    A line is treated as a section header only when it is short enough to
    plausibly be a heading (<= 40 chars) and equals / starts with / ends
    with a known header phrase. The original substring test
    (``header in lower_line``) reclassified ordinary body sentences that
    merely mentioned a keyword (e.g. "improved my skills in ...") as
    headers, silently dropping that content.

    Parameters
    ----------
    text : str
        Full resume text.

    Returns
    -------
    dict[str, str]
        Mapping of section name -> accumulated body text (newline-joined).
    """
    section_headers = {
        'experience': ['experience', 'work experience', 'professional experience'],
        'education': ['education', 'academic qualifications', 'qualifications'],
        'skills': ['skills', 'technical skills', 'key skills', 'core competencies'],
        'certifications': ['certifications', 'certification', 'achievements'],
        'projects': ['projects', 'project experience', 'personal projects', 'project'],
    }
    sections = {key: "" for key in section_headers}
    current_section = None
    for raw_line in text.splitlines():
        line = raw_line.strip()
        if not line:
            continue
        # Normalize: lowercase and drop a trailing colon ("SKILLS:" -> "skills").
        lower_line = line.lower().rstrip(':')
        matched = None
        # Headings are short; long lines are body text even if they mention
        # a section keyword.
        if len(lower_line) <= 40:
            for sec, headers in section_headers.items():
                if any(lower_line == h
                       or lower_line.startswith(h + ' ')
                       or lower_line.endswith(' ' + h)
                       for h in headers):
                    matched = sec
                    break
        if matched:
            current_section = matched
            continue
        if current_section:
            sections[current_section] += line + "\n"
    return sections
def parse_skills(section_text, ocr_used=False):
    """Parse the skills section into a list of individual skills.

    Parameters
    ----------
    section_text : str
        Raw text of the skills section.
    ocr_used : bool
        Whether the source text came from OCR; lowers confidence to 0.9
        (fix: the original always reported 1.0, inconsistent with the
        other section parsers).

    Returns
    -------
    tuple[list[str] | None, float]
        The skills and a confidence score (0.0 when the section is empty).
    """
    if not section_text.strip():
        return None, 0.0
    confidence = 0.9 if ocr_used else 1.0
    # Comma/pipe separated format.
    if re.search(r"[,|]", section_text):
        parts = re.split(r"\s*[,|]\s*", section_text)
        cleaned = [p.strip() for p in parts if p.strip()]
        if cleaned:
            return cleaned, confidence
    # Line-break separated format. Because section_text is non-blank this
    # always yields at least one line, which made the original skills.json
    # dictionary fallback unreachable dead code; it has been removed.
    lines = [line.strip() for line in section_text.splitlines() if line.strip()]
    return lines, confidence
def parse_experience(section_text, ocr_used=False):
    """Return the experience section body and a confidence score.

    Lines containing the standalone words "project" or "skill" (likely
    leaked headers from adjacent sections) are filtered out. Confidence
    is 0.9 when the text came from OCR, 1.0 otherwise.
    """
    if not section_text.strip():
        return None, 0.0
    leaked_header = re.compile(r"\b(project|skill)\b", re.IGNORECASE)
    kept = [ln for ln in section_text.splitlines()
            if ln.strip() and not leaked_header.search(ln)]
    if not kept:
        return None, 0.0
    return "\n".join(kept).strip(), (0.9 if ocr_used else 1.0)
def parse_education(section_text, ocr_used=False):
    """Return the education section body (blank lines removed) plus a
    confidence score: 0.9 for OCR-derived text, 1.0 otherwise."""
    if not section_text.strip():
        return None, 0.0
    body = "\n".join(ln for ln in section_text.splitlines() if ln.strip())
    return body.strip(), 0.9 if ocr_used else 1.0
def parse_certifications(section_text, ocr_used=False):
    """Return the certifications section body (blank lines removed) plus a
    confidence score: 0.9 for OCR-derived text, 1.0 otherwise."""
    if not section_text.strip():
        return None, 0.0
    content = "\n".join(filter(str.strip, section_text.splitlines())).strip()
    return content, 0.9 if ocr_used else 1.0
def parse_projects(section_text, ocr_used=False):
    """Parse the projects section into newline-joined "title: description"
    entries.

    A line containing a 4-digit number, "present", or "github"
    (case-insensitive) starts a new project; subsequent lines accumulate
    as its description. Returns (entries | None, confidence) where
    confidence is 0.9 for OCR text and 1.0 otherwise.
    """
    if not section_text.strip():
        return None, 0.0
    title_marker = re.compile(r"\d{4}|present|github", re.IGNORECASE)
    collected = []
    title, desc = "", ""
    for line in section_text.splitlines():
        if not line.strip():
            continue
        if title_marker.search(line):
            if title:
                collected.append((title, desc))
            title, desc = line.strip(), ""
        else:
            desc += line + " "
    if title:
        collected.append((title, desc))
    entries = [f"{t}: {d.strip()}" if d.strip() else t for t, d in collected]
    joined = "\n".join(entries) if entries else None
    return joined, (0.9 if ocr_used else 1.0)
def parse_header_fields(text):
    """Extract name and contact details from the resume header.

    Only lines above the first section heading (at most the first 8) are
    considered for the name. Returns a dict with keys "name", "email",
    "phone", "linkedin"; each value is ``{"value": str | None,
    "confidence": float}`` (0.99 when found, 0.0 otherwise).
    """
    lines = [line.strip() for line in text.splitlines()]
    section_keywords = ("objective", "summary", "experience", "education",
                        "project", "skill", "certification", "interests")
    # Locate the first section heading; the name must appear above it.
    header_idx = len(lines)
    for i, line in enumerate(lines):
        if any(kw in line.lower() for kw in section_keywords):
            header_idx = i
            break
    name = ""
    for line in lines[:min(header_idx, 8)]:
        if not line:
            continue
        # Title-case name, e.g. "Jane Doe".
        if re.match(r'^[A-Z][a-z]+(?:\s+[A-Z][a-z]*)*$', line):
            name = line
            break
        # ALL-CAPS fallback, e.g. "JANE DOE" (requires at least two words).
        if re.match(r'^[A-Z\s]{3,}$', line) and len(line.split()) >= 2:
            name = line.title()
            break
    # fix: the original TLD class [A-Z|a-z] also accepted a literal '|'.
    email_match = re.search(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b', text)
    phone_match = re.search(r'(?:\+?(\d{1,3}))?[-. (]*(\d{3})[-. )]*(\d{3})[-. ]*(\d{4})', text)
    linkedin_match = re.search(r'(https?://)?(www\.)?linkedin\.com/in/[\w-]+', text)
    return {
        "name": {"value": name or None,
                 "confidence": 0.99 if name else 0.0},
        "email": {"value": email_match.group(0) if email_match else None,
                  "confidence": 0.99 if email_match else 0.0},
        "phone": {"value": phone_match.group(0) if phone_match else None,
                  "confidence": 0.99 if phone_match else 0.0},
        "linkedin": {"value": linkedin_match.group(0) if linkedin_match else None,
                     "confidence": 0.99 if linkedin_match else 0.0},
    }
def parse_resume(file_path):
    """Parse a resume PDF into structured fields with confidence scores.

    Extracts text (OCR fallback included), splits it into sections, then
    runs each section-specific parser. Returns a dict mapping field name
    to {"value": ..., "confidence": float}.
    """
    text, ocr_used = extract_text_from_pdf(file_path)
    sections = split_sections(text)
    result = dict(parse_header_fields(text))
    section_parsers = (
        ("skills", parse_skills),
        ("experience", parse_experience),
        ("education", parse_education),
        ("projects", parse_projects),
        ("certifications", parse_certifications),
    )
    for key, parser in section_parsers:
        value, confidence = parser(sections.get(key, ''), ocr_used)
        result[key] = {"value": value, "confidence": confidence}
    return result