"""Heuristic resume parser: extracts text from PDF/DOCX uploads and pulls out
contact details, education, work experience, skills and certifications."""

import io
import re
from typing import List, Optional, Dict

import PyPDF2
import docx
from pydantic import BaseModel


class Education(BaseModel):
    """One education entry found in the resume's education section."""

    institution: str
    degree: str
    field_of_study: Optional[str] = None
    graduation_date: Optional[str] = None


class WorkExperience(BaseModel):
    """One job entry found in the resume's experience section."""

    company: str
    position: str
    start_date: Optional[str] = None
    end_date: Optional[str] = None
    description: Optional[str] = None


class PersonalInfo(BaseModel):
    """Contact details; every field is optional because extraction is best-effort."""

    name: Optional[str] = None
    email: Optional[str] = None
    phone: Optional[str] = None
    location: Optional[str] = None
    github: Optional[str] = None
    linkedin: Optional[str] = None
    portfolio: Optional[str] = None


class ResumeData(BaseModel):
    """Aggregate result returned by :meth:`ResumeParser.parse`."""

    personal_info: PersonalInfo
    education: List[Education]
    work_experience: List[WorkExperience]
    skills: List[str]
    certifications: List[str]
    raw_text: str


class ResumeParser:
    """Extract structured data from a resume using regexes and line heuristics.

    The parser is deliberately simple: sections are located by header
    keywords, and entries inside a section are recognised by marker words
    (degree names, "Month YYYY" dates).
    """

    # Substrings that mark a line as describing a degree.
    _DEGREE_KEYWORDS = ("Bachelor", "Master", "PhD", "B.S.", "M.S.", "Ph.D")

    # "Jan 2020" / "January 2020" style dates; compiled once instead of per line.
    _MONTH_YEAR_RE = re.compile(
        r'\b(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec|January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{4}\b'
    )

    def __init__(self):
        # Patterns are kept as plain strings so external code that reads these
        # attributes keeps working.
        # The TLD class is [A-Za-z]; the previous [A-Z|a-z] also accepted a
        # literal '|' character.
        self.email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
        self.phone_pattern = r'(\+\d{1,3}[\s-]?)?\(?\d{3}\)?[\s.-]?\d{3}[\s.-]?\d{4}'
        self.github_pattern = r'github\.com/([A-Za-z0-9_-]+)'
        self.linkedin_pattern = r'linkedin\.com/in/([A-Za-z0-9_-]+)'

    def parse(self, uploaded_file) -> ResumeData:
        """Parse the uploaded resume file and extract key information.

        Args:
            uploaded_file: Object exposing ``.name`` (the file name, used to
                pick the extractor by extension) and ``.getvalue()`` (the raw
                bytes of the file).

        Returns:
            A ``ResumeData`` with the extracted fields and the raw text.

        Raises:
            ValueError: If the file extension is not pdf, docx or doc.
        """
        # Extract text from file
        file_extension = uploaded_file.name.split('.')[-1].lower()

        if file_extension == 'pdf':
            raw_text = self._extract_text_from_pdf(uploaded_file)
        elif file_extension in ['docx', 'doc']:
            # NOTE(review): python-docx targets the .docx format; a legacy
            # binary .doc will presumably fail inside docx.Document — confirm
            # whether 'doc' should really be accepted here.
            raw_text = self._extract_text_from_docx(uploaded_file)
        else:
            raise ValueError("Unsupported file format. Please upload a PDF or DOCX file.")

        # Extract components and assemble the result
        return ResumeData(
            personal_info=self._extract_personal_info(raw_text),
            education=self._extract_education(raw_text),
            work_experience=self._extract_work_experience(raw_text),
            skills=self._extract_skills(raw_text),
            certifications=self._extract_certifications(raw_text),
            raw_text=raw_text,
        )

    def _extract_text_from_pdf(self, file) -> str:
        """Extract text from a PDF file, one page after another.

        Pages are separated by a newline so the last line of one page is not
        fused with the first line of the next (the section parsing is
        line-based). A page that yields no text contributes an empty string
        instead of raising.
        """
        pdf_reader = PyPDF2.PdfReader(io.BytesIO(file.getvalue()))
        return "\n".join(page.extract_text() or "" for page in pdf_reader.pages)

    def _extract_text_from_docx(self, file) -> str:
        """Extract text from a DOCX file, one line per paragraph."""
        doc = docx.Document(io.BytesIO(file.getvalue()))
        return "".join(paragraph.text + "\n" for paragraph in doc.paragraphs)

    def _extract_personal_info(self, text) -> PersonalInfo:
        """Extract personal information from resume text.

        Email, phone, GitHub and LinkedIn handles come from the regex
        patterns set in ``__init__``. The name is taken to be the first
        non-blank line of the text; it is ``None`` when the text has no
        non-blank line.
        """
        # Basic extraction with regex
        email = re.search(self.email_pattern, text)
        phone = re.search(self.phone_pattern, text)
        # Host names are case-insensitive, so "GitHub.com/user" must match too.
        github = re.search(self.github_pattern, text, re.IGNORECASE)
        linkedin = re.search(self.linkedin_pattern, text, re.IGNORECASE)

        # First non-blank line often contains the name
        name = next(
            (line.strip() for line in text.split('\n') if line.strip()),
            None,
        )

        return PersonalInfo(
            name=name,
            email=email.group(0) if email else None,
            phone=phone.group(0) if phone else None,
            github=github.group(1) if github else None,
            linkedin=linkedin.group(1) if linkedin else None,
        )

    def _extract_education(self, text) -> List[Education]:
        """Extract education information from resume text.

        Every line of the education section that contains a degree keyword
        becomes one entry: the text before the first comma is the degree and
        the text between the first and second comma (if any) is the
        institution, otherwise the institution is an empty string.
        """
        # Simple implementation - in a real system this would be more sophisticated
        education_section = self._extract_section(
            text, ["EDUCATION", "Education", "ACADEMIC BACKGROUND"]
        )
        if not education_section:
            return []

        # Very basic parsing - a real implementation would use more sophisticated NLP
        educations = []
        for line in education_section.split('\n'):
            if not line.strip():
                continue
            if not any(keyword in line for keyword in self._DEGREE_KEYWORDS):
                continue
            parts = line.split(',')
            educations.append(
                Education(
                    institution=parts[1].strip() if len(parts) > 1 else "",
                    degree=parts[0].strip(),
                )
            )
        return educations

    def _extract_work_experience(self, text) -> List[WorkExperience]:
        """Extract work experience from resume text.

        Every line of the experience section that contains a "Month YYYY"
        date starts one entry.
        """
        experience_section = self._extract_section(
            text, ["EXPERIENCE", "Experience", "WORK EXPERIENCE", "EMPLOYMENT"]
        )
        if not experience_section:
            return []

        # Simple implementation
        experiences = []
        for line in experience_section.split('\n'):
            if not line.strip():
                continue
            if not self._MONTH_YEAR_RE.search(line):
                continue

            # Very simplistic parsing.
            # NOTE(review): both patterns search from the start of the line,
            # so company and position usually capture nearly the same text;
            # a real split needs knowledge of the line layout.
            company_match = re.search(r'([A-Za-z0-9\s]+)', line)
            company = company_match.group(1).strip() if company_match else "Unknown Company"

            position_match = re.search(r'([A-Za-z\s]+)', line)
            position = position_match.group(1).strip() if position_match else "Unknown Position"

            experiences.append(WorkExperience(company=company, position=position))
        return experiences

    def _extract_skills(self, text) -> List[str]:
        """Extract skills from resume text.

        The skills section is flattened to one line and split on commas and
        bullet characters; empty items are dropped.
        """
        skills_section = self._extract_section(text, ["SKILLS", "Skills", "TECHNICAL SKILLS"])
        if not skills_section:
            return []

        # Simple split by commas/bullets and cleanup
        skills_text = skills_section.replace('\n', ' ')
        return [skill.strip() for skill in re.split(r'[,•]', skills_text) if skill.strip()]

    def _extract_certifications(self, text) -> List[str]:
        """Extract certifications from resume text, one per non-blank line."""
        cert_section = self._extract_section(
            text, ["CERTIFICATIONS", "Certifications", "CERTIFICATES"]
        )
        if not cert_section:
            return []

        # Simple split by newlines and cleanup
        return [cert.strip() for cert in cert_section.split('\n') if cert.strip()]

    def _extract_section(self, text, section_headers) -> str:
        """Extract a section from the resume text based on headers.

        The section starts on the line after the first line containing any
        of ``section_headers`` (substring match) and runs until the next
        non-blank all-uppercase line, which is taken to be the following
        section's heading, or until the end of the text.

        Args:
            text: Full resume text.
            section_headers: Candidate header strings for the wanted section.

        Returns:
            The section body with surrounding whitespace stripped, or an
            empty string if no header was found.
        """
        collected = []
        in_section = False

        for line in text.split('\n'):
            if not in_section:
                # Still looking for the start of the section.
                if any(header in line for header in section_headers):
                    in_section = True
                continue

            # Test the current line, not the following one: looking ahead and
            # breaking first would drop the last content line of the section.
            # Limitation: an all-caps content line also ends the section.
            stripped = line.strip()
            if stripped and stripped.isupper():
                break

            collected.append(line)

        return "\n".join(collected).strip()