File size: 8,567 Bytes
cb0c53a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
import PyPDF2
import docx
import io
import re
from pydantic import BaseModel
from typing import List, Optional, Dict

class Education(BaseModel):
    """A single education entry extracted from a resume."""

    # Required fields.
    institution: str
    degree: str
    # Optional details; each defaults to None when not supplied.
    field_of_study: Optional[str] = None
    graduation_date: Optional[str] = None

class WorkExperience(BaseModel):
    """A single work-experience entry extracted from a resume."""

    # Required fields.
    company: str
    position: str
    # Optional details kept as free-form strings; each defaults to None.
    start_date: Optional[str] = None
    end_date: Optional[str] = None
    description: Optional[str] = None

class PersonalInfo(BaseModel):
    """Contact and identity details of the resume's owner.

    Every field is optional and defaults to None.
    """

    name: Optional[str] = None
    email: Optional[str] = None
    phone: Optional[str] = None
    location: Optional[str] = None
    # ResumeParser stores the captured profile handle here (regex group 1),
    # not the full URL.
    github: Optional[str] = None
    linkedin: Optional[str] = None
    portfolio: Optional[str] = None

class ResumeData(BaseModel):
    """Structured result of parsing one resume; all fields are required."""

    personal_info: PersonalInfo
    education: List[Education]
    work_experience: List[WorkExperience]
    skills: List[str]
    certifications: List[str]
    # The text extracted from the uploaded file, from which the other fields
    # were derived.
    raw_text: str

class ResumeParser:
    def __init__(self):
        self.email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b'
        self.phone_pattern = r'(\+\d{1,3}[\s-]?)?\(?\d{3}\)?[\s.-]?\d{3}[\s.-]?\d{4}'
        self.github_pattern = r'github\.com/([A-Za-z0-9_-]+)'
        self.linkedin_pattern = r'linkedin\.com/in/([A-Za-z0-9_-]+)'

    def parse(self, uploaded_file) -> ResumeData:
        """Parse the uploaded resume file and extract key information"""
        # Extract text from file
        file_extension = uploaded_file.name.split('.')[-1].lower()
        
        if file_extension == 'pdf':
            raw_text = self._extract_text_from_pdf(uploaded_file)
        elif file_extension in ['docx', 'doc']:
            raw_text = self._extract_text_from_docx(uploaded_file)
        else:
            raise ValueError("Unsupported file format. Please upload a PDF or DOCX file.")
        
        # Extract components
        personal_info = self._extract_personal_info(raw_text)
        education = self._extract_education(raw_text)
        work_experience = self._extract_work_experience(raw_text)
        skills = self._extract_skills(raw_text)
        certifications = self._extract_certifications(raw_text)
        
        # Create and return ResumeData
        resume_data = ResumeData(
            personal_info=personal_info,
            education=education,
            work_experience=work_experience,
            skills=skills,
            certifications=certifications,
            raw_text=raw_text
        )
        
        return resume_data
    
    def _extract_text_from_pdf(self, file) -> str:
        """Extract the text of every page of a PDF upload.

        Args:
            file: Upload object exposing ``getvalue()``; its result is wrapped
                in ``io.BytesIO`` and handed to ``PyPDF2.PdfReader``.

        Returns:
            The text of all pages in document order, separated by newlines.
        """
        pdf_reader = PyPDF2.PdfReader(io.BytesIO(file.getvalue()))
        # Pages are joined with a newline so the last line of one page is not
        # fused with the first line of the next; the section extraction that
        # consumes this text works line by line. ``or ""`` keeps the join safe
        # should a page yield no text object at all.
        return "\n".join(page.extract_text() or "" for page in pdf_reader.pages)
    
    def _extract_text_from_docx(self, file) -> str:
        """Return the text of a DOCX upload, one paragraph per line.

        ``file.getvalue()`` is wrapped in ``io.BytesIO`` and opened with
        ``docx.Document``; only ``doc.paragraphs`` is read. Every paragraph,
        including the last, is followed by a newline.
        """
        document = docx.Document(io.BytesIO(file.getvalue()))
        return "".join(para.text + "\n" for para in document.paragraphs)
    
    def _extract_personal_info(self, text) -> PersonalInfo:
        """Extract contact details from resume text.

        Args:
            text: Full resume text.

        Returns:
            A ``PersonalInfo`` whose ``email``/``phone`` hold the first regex
            match (or None), whose ``github``/``linkedin`` hold the captured
            profile handle (or None), and whose ``name`` is the first
            non-blank line (or None when the text has no non-blank line).
            ``location`` and ``portfolio`` are left at their defaults.
        """
        email = re.search(self.email_pattern, text)
        phone = re.search(self.phone_pattern, text)
        # Host names are case-insensitive, so "GitHub.com/..." and
        # "LinkedIn.com/in/..." must match as well.
        github = re.search(self.github_pattern, text, re.IGNORECASE)
        linkedin = re.search(self.linkedin_pattern, text, re.IGNORECASE)

        # Heuristic: the first line that is not blank is taken as the name.
        # ``str.split`` always returns at least one element, so emptiness has
        # to be tested on the line content rather than on the list.
        name = next(
            (line.strip() for line in text.split('\n') if line.strip()),
            None,
        )

        return PersonalInfo(
            name=name,
            email=email.group(0) if email else None,
            phone=phone.group(0) if phone else None,
            github=github.group(1) if github else None,
            linkedin=linkedin.group(1) if linkedin else None
        )
    
    def _extract_education(self, text) -> List[Education]:
        """Extract education information from resume text"""
        # Simple implementation - in a real system this would be more sophisticated
        education_section = self._extract_section(text, ["EDUCATION", "Education", "ACADEMIC BACKGROUND"])
        if not education_section:
            return []
        
        # Very basic parsing - a real implementation would use more sophisticated NLP
        educations = []
        lines = education_section.split('\n')
        current_education = None
        
        for line in lines:
            if not line.strip():
                continue
                
            if any(degree in line for degree in ["Bachelor", "Master", "PhD", "B.S.", "M.S.", "Ph.D"]):
                if current_education:
                    educations.append(current_education)
                
                parts = line.split(',')
                degree = parts[0].strip() if parts else line.strip()
                institution = parts[1].strip() if len(parts) > 1 else ""
                
                current_education = Education(
                    institution=institution,
                    degree=degree
                )
        
        if current_education:
            educations.append(current_education)
            
        return educations
    
    def _extract_work_experience(self, text) -> List[WorkExperience]:
        """Extract work experience from resume text"""
        experience_section = self._extract_section(text, ["EXPERIENCE", "Experience", "WORK EXPERIENCE", "EMPLOYMENT"])
        if not experience_section:
            return []
        
        # Simple implementation
        experiences = []
        lines = experience_section.split('\n')
        current_experience = None
        
        for line in lines:
            if not line.strip():
                continue
                
            if re.search(r'\b(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec|January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{4}\b', line):
                if current_experience:
                    experiences.append(current_experience)
                
                # Very simplistic parsing
                company_match = re.search(r'([A-Za-z0-9\s]+)', line)
                company = company_match.group(1).strip() if company_match else "Unknown Company"
                
                position_match = re.search(r'([A-Za-z\s]+)', line)
                position = position_match.group(1).strip() if position_match else "Unknown Position"
                
                current_experience = WorkExperience(
                    company=company,
                    position=position
                )
        
        if current_experience:
            experiences.append(current_experience)
            
        return experiences
    
    def _extract_skills(self, text) -> List[str]:
        """Extract skills from resume text"""
        skills_section = self._extract_section(text, ["SKILLS", "Skills", "TECHNICAL SKILLS"])
        if not skills_section:
            return []
        
        # Simple split by commas and cleanup
        skills_text = skills_section.replace('\n', ' ')
        skills = [skill.strip() for skill in re.split(r'[,•]', skills_text) if skill.strip()]
        
        return skills
    
    def _extract_certifications(self, text) -> List[str]:
        """Extract certifications from resume text"""
        cert_section = self._extract_section(text, ["CERTIFICATIONS", "Certifications", "CERTIFICATES"])
        if not cert_section:
            return []
        
        # Simple split by newlines and cleanup
        certifications = [cert.strip() for cert in cert_section.split('\n') if cert.strip()]
        
        return certifications
    
    def _extract_section(self, text, section_headers) -> str:
        """Extract a section from the resume text based on headers"""
        lines = text.split('\n')
        section_text = ""
        in_section = False
        
        for i, line in enumerate(lines):
            # Check if this line contains a section header
            if any(header in line for header in section_headers):
                in_section = True
                continue
            
            # Check if we've reached the next section
            if in_section and i < len(lines) - 1:
                next_line = lines[i+1]
                if next_line.isupper() and len(next_line.strip()) > 0:
                    break
            
            if in_section:
                section_text += line + "\n"
        
        return section_text.strip()