Spaces:
Sleeping
Sleeping
| import PyPDF2 | |
| import docx | |
| import io | |
| import re | |
| from pydantic import BaseModel | |
| from typing import List, Optional, Dict | |
class Education(BaseModel):
    """A single education entry found on a resume.

    All values are kept as free-form strings exactly as they were extracted;
    nothing here normalises dates or degree names.
    """

    # School / university name (required).
    institution: str
    # Degree text, e.g. the part of the line that mentions "Bachelor" (required).
    degree: str
    # Major or subject area, when known.
    field_of_study: Optional[str] = None
    # Graduation date as written on the resume, when known.
    graduation_date: Optional[str] = None
class WorkExperience(BaseModel):
    """A single job / position entry found on a resume.

    Dates and description are optional free-form strings; only the employer
    and the job title are mandatory.
    """

    # Employer name (required).
    company: str
    # Job title held at that employer (required).
    position: str
    # Start of the engagement, as written on the resume.
    start_date: Optional[str] = None
    # End of the engagement, as written on the resume.
    end_date: Optional[str] = None
    # Free-text summary of responsibilities or achievements.
    description: Optional[str] = None
class PersonalInfo(BaseModel):
    """Contact and identity details of the resume's owner.

    Every field is optional and defaults to ``None`` so that a partially
    recognised resume still produces a valid object.
    """

    # Candidate's name.
    name: Optional[str] = None
    # E-mail address.
    email: Optional[str] = None
    # Phone number, kept in whatever format it was written.
    phone: Optional[str] = None
    # City / region / country.
    location: Optional[str] = None
    # GitHub reference (profile handle or URL).
    github: Optional[str] = None
    # LinkedIn reference (profile handle or URL).
    linkedin: Optional[str] = None
    # Personal website or portfolio link.
    portfolio: Optional[str] = None
class ResumeData(BaseModel):
    """Complete structured result of parsing one resume.

    Bundles the extracted sections together with the unprocessed text so
    callers can re-inspect the source when a heuristic missed something.
    All fields are required; use empty lists for sections that were not found.
    """

    # Contact / identity block.
    personal_info: PersonalInfo
    # Education entries, in the order they were found.
    education: List[Education]
    # Work-experience entries, in the order they were found.
    work_experience: List[WorkExperience]
    # Individual skill strings.
    skills: List[str]
    # Individual certification strings.
    certifications: List[str]
    # Full text extracted from the uploaded document.
    raw_text: str
class ResumeParser:
    """Heuristic resume parser that turns a PDF/DOCX upload into ``ResumeData``.

    Text is pulled out of the file with PyPDF2 or python-docx and then mined
    with regular expressions and simple, line-based section detection.  The
    extraction is deliberately basic: it relies on conventional section
    headings and does not attempt any real NLP.
    """

    # Keywords that mark a line of the education section as a degree entry.
    _DEGREE_MARKERS = ("Bachelor", "Master", "PhD", "B.S.", "M.S.", "Ph.D")

    # "<month name> <4-digit year>" marks a line of the experience section as
    # the start of an entry.  Compiled once because it is applied per line.
    _MONTH_YEAR_RE = re.compile(
        r'\b(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec|January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{4}\b'
    )

    def __init__(self):
        """Set up the regex pattern strings used for contact details."""
        # Kept as plain pattern strings (not compiled objects) so the
        # attributes stay usable by any caller that passes them to ``re``.
        # The TLD class is [A-Za-z]; the previous [A-Z|a-z] also accepted a
        # literal "|" as part of the address.
        self.email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
        self.phone_pattern = r'(\+\d{1,3}[\s-]?)?\(?\d{3}\)?[\s.-]?\d{3}[\s.-]?\d{4}'
        self.github_pattern = r'github\.com/([A-Za-z0-9_-]+)'
        self.linkedin_pattern = r'linkedin\.com/in/([A-Za-z0-9_-]+)'

    def parse(self, uploaded_file) -> "ResumeData":
        """Parse the uploaded resume file and extract key information.

        Args:
            uploaded_file: Object exposing ``.name`` (the file name) and
                ``.getvalue()`` (the raw bytes).  Presumably a Streamlit
                ``UploadedFile`` - confirm against the caller.

        Returns:
            A ``ResumeData`` holding every extracted section plus the raw text.

        Raises:
            ValueError: If the file extension is not pdf, docx or doc.
        """
        file_extension = uploaded_file.name.split('.')[-1].lower()
        if file_extension == 'pdf':
            raw_text = self._extract_text_from_pdf(uploaded_file)
        elif file_extension in ('docx', 'doc'):
            # NOTE(review): ".doc" is routed to python-docx as well; that
            # library targets the .docx format, so a genuine legacy .doc file
            # will probably fail inside docx.Document - confirm and decide
            # whether to reject it here instead.
            raw_text = self._extract_text_from_docx(uploaded_file)
        else:
            raise ValueError("Unsupported file format. Please upload a PDF or DOCX file.")

        return ResumeData(
            personal_info=self._extract_personal_info(raw_text),
            education=self._extract_education(raw_text),
            work_experience=self._extract_work_experience(raw_text),
            skills=self._extract_skills(raw_text),
            certifications=self._extract_certifications(raw_text),
            raw_text=raw_text,
        )

    def _extract_text_from_pdf(self, file) -> str:
        """Extract text from a PDF file.

        Pages are joined with a newline so that the last line of one page and
        the first line of the next stay separate lines for the line-based
        section parsing.  A page whose ``extract_text()`` yields nothing
        contributes an empty string instead of breaking the join.
        """
        pdf_reader = PyPDF2.PdfReader(io.BytesIO(file.getvalue()))
        return "\n".join(page.extract_text() or "" for page in pdf_reader.pages)

    def _extract_text_from_docx(self, file) -> str:
        """Extract text from a DOCX file, one line per paragraph."""
        doc = docx.Document(io.BytesIO(file.getvalue()))
        return "".join(paragraph.text + "\n" for paragraph in doc.paragraphs)

    def _extract_personal_info(self, text) -> "PersonalInfo":
        """Extract personal information from resume text.

        E-mail, phone, GitHub and LinkedIn come from the regex patterns set
        in ``__init__`` (for GitHub/LinkedIn only the captured profile handle
        is stored).  The name is assumed to be the first non-blank line of
        the document; it is ``None`` when the text has no non-blank line.
        """
        email = re.search(self.email_pattern, text)
        phone = re.search(self.phone_pattern, text)
        github = re.search(self.github_pattern, text)
        linkedin = re.search(self.linkedin_pattern, text)

        # Skip leading blank lines: extracted text frequently starts with them.
        name = next(
            (line.strip() for line in text.split('\n') if line.strip()),
            None,
        )

        return PersonalInfo(
            name=name,
            email=email.group(0) if email else None,
            phone=phone.group(0) if phone else None,
            github=github.group(1) if github else None,
            linkedin=linkedin.group(1) if linkedin else None,
        )

    def _extract_education(self, text) -> "List[Education]":
        """Extract education entries from resume text.

        Every line of the education section that contains a degree keyword
        becomes one entry: the text before the first comma is the degree and
        the text between the first and second comma is the institution (empty
        string when the line has no comma).
        """
        education_section = self._extract_section(
            text, ["EDUCATION", "Education", "ACADEMIC BACKGROUND"]
        )
        if not education_section:
            return []

        educations = []
        for line in education_section.split('\n'):
            if not line.strip():
                continue
            if not any(marker in line for marker in self._DEGREE_MARKERS):
                continue
            parts = line.split(',')
            educations.append(
                Education(
                    institution=parts[1].strip() if len(parts) > 1 else "",
                    degree=parts[0].strip(),
                )
            )
        return educations

    def _extract_work_experience(self, text) -> "List[WorkExperience]":
        """Extract work-experience entries from resume text.

        Every line of the experience section that contains a
        "<month> <year>" date becomes one entry.

        NOTE(review): company and position are both taken from the first run
        of letters (company also allows digits) on that line, so they are
        usually the same text.  This mirrors the original simplistic
        heuristic; a real implementation needs to know the line layout.
        """
        experience_section = self._extract_section(
            text, ["EXPERIENCE", "Experience", "WORK EXPERIENCE", "EMPLOYMENT"]
        )
        if not experience_section:
            return []

        experiences = []
        for line in experience_section.split('\n'):
            entry = line.strip()
            if not entry or not self._MONTH_YEAR_RE.search(entry):
                continue

            company_match = re.search(r'([A-Za-z0-9\s]+)', entry)
            position_match = re.search(r'([A-Za-z\s]+)', entry)
            # "or" fallback: a match made only of whitespace strips to "" and
            # must not end up as an empty company / position.
            company = (
                company_match.group(1).strip() if company_match else ""
            ) or "Unknown Company"
            position = (
                position_match.group(1).strip() if position_match else ""
            ) or "Unknown Position"

            experiences.append(WorkExperience(company=company, position=position))
        return experiences

    def _extract_skills(self, text) -> List[str]:
        """Extract skills from resume text.

        The skills section is flattened to one line (newlines become spaces)
        and split on commas and bullet characters; empty pieces are dropped.
        """
        skills_section = self._extract_section(text, ["SKILLS", "Skills", "TECHNICAL SKILLS"])
        if not skills_section:
            return []

        skills_text = skills_section.replace('\n', ' ')
        return [skill.strip() for skill in re.split(r'[,•]', skills_text) if skill.strip()]

    def _extract_certifications(self, text) -> List[str]:
        """Extract certifications from resume text, one per non-blank line."""
        cert_section = self._extract_section(
            text, ["CERTIFICATIONS", "Certifications", "CERTIFICATES"]
        )
        if not cert_section:
            return []

        return [cert.strip() for cert in cert_section.split('\n') if cert.strip()]

    def _extract_section(self, text, section_headers) -> str:
        """Extract a section from the resume text based on headers.

        The section starts on the line after the first line that contains any
        of ``section_headers`` (substring match) and runs until the next
        non-blank line written entirely in upper case, which is taken to be
        the following section heading, or until the end of the text.

        Args:
            text: Full resume text.
            section_headers: Heading spellings that may open the section.

        Returns:
            The section body with surrounding whitespace stripped, or ``""``
            when no heading was found or the section is empty.

        Limitations of the heuristic: a content line that happens to be all
        upper case (e.g. a line of acronyms) ends the section early, and a
        following heading written in mixed case does not end it.
        """
        collected = []
        in_section = False

        for line in text.split('\n'):
            if not in_section:
                # Still looking for the heading; the heading line itself is
                # not part of the section body.
                if any(header in line for header in section_headers):
                    in_section = True
                continue

            # Test the *current* line for being the next heading.  Looking
            # one line ahead instead would drop the last line of the section.
            stripped = line.strip()
            if stripped and stripped.isupper():
                break

            collected.append(line)

        return "\n".join(collected).strip()