Akash076 committed on
Commit
3e62707
·
verified ·
1 Parent(s): 08dd5c4

Upload pdf_parser.py

Browse files
Files changed (1) hide show
  1. Src/pdf_parser.py +205 -0
Src/pdf_parser.py ADDED
@@ -0,0 +1,205 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import fitz
2
+ from PIL import Image
3
+ import pytesseract
4
+ import re
5
+ import io
6
+ import json
7
+
8
def extract_text_from_pdf(file_path):
    """Extract the text of every page of a PDF, using OCR for empty pages.

    Each page is first read through its embedded text layer. When that
    layer is empty after stripping whitespace, the page is rendered at
    300 DPI and passed to Tesseract instead.

    Args:
        file_path: Path of the PDF, passed unchanged to ``fitz.open``.

    Returns:
        A ``(text, ocr_used)`` tuple. ``text`` is the per-page text, each
        page followed by a newline. ``ocr_used`` is True when at least one
        page had to go through OCR.
    """
    page_texts = []
    ocr_used = False
    # The context manager closes the document even when rendering or OCR
    # raises part-way through, so the file handle is not leaked.
    with fitz.open(file_path) as doc:
        for page in doc:
            page_text = page.get_text().strip()
            if not page_text:
                ocr_used = True
                pix = page.get_pixmap(dpi=300)
                # Release the decoded image as soon as OCR is done.
                with Image.open(io.BytesIO(pix.tobytes())) as img:
                    page_text = pytesseract.image_to_string(img)
            page_texts.append(page_text + "\n")
    return "".join(page_texts), ocr_used
23
+
24
def _match_section_header(line, section_headers, max_header_words):
    """Return the section key whose header phrase *line* announces, or None."""
    normalized = " ".join(line.lower().split())
    # Drop decoration such as bullets, dashes or a trailing colon.
    normalized = re.sub(r"^[\W_]+|[\W_]+$", "", normalized)
    if not normalized or len(normalized.split()) > max_header_words:
        # Long lines are body text even when they mention a keyword.
        return None

    best_section = None
    best_length = 0
    for section, headers in section_headers.items():
        for header in headers:
            if len(header) <= best_length:
                continue
            phrase = re.escape(header)
            starts = re.match(rf"{phrase}\b", normalized)
            ends = re.search(rf"\b{phrase}$", normalized)
            if starts or ends:
                # Prefer the longest phrase so "project experience" is
                # filed under projects rather than under experience.
                best_section, best_length = section, len(header)
    return best_section


def split_sections(text, max_header_words=5):
    """Split resume text into its known sections.

    A line is treated as a section header only when it is short (at most
    ``max_header_words`` words) and starts or ends with one of the known
    header phrases on a word boundary. Header lines themselves are not
    stored. Every other non-empty line is appended to the section opened
    by the most recent header; lines before the first header are ignored.

    Args:
        text: Full resume text. ``None`` or an empty string yields empty
            sections.
        max_header_words: Maximum number of words a line may contain and
            still be considered a header.

    Returns:
        A dict with the keys ``experience``, ``education``, ``skills``,
        ``certifications`` and ``projects``. Each value is the section's
        lines, stripped and newline-terminated, or ``""`` when the section
        was not found.
    """
    section_headers = {
        'experience': ['experience', 'work experience', 'professional experience'],
        'education': ['education', 'academic qualifications', 'qualifications'],
        'skills': ['skills', 'technical skills', 'key skills', 'core competencies'],
        'certifications': ['certifications', 'certification', 'achievements'],
        'projects': ['projects', 'project experience', 'personal projects', 'project'],
    }
    sections = {key: "" for key in section_headers}
    if not text:
        return sections

    current_section = None
    for raw_line in text.splitlines():
        line = raw_line.strip()
        if not line:
            continue
        matched = _match_section_header(line, section_headers, max_header_words)
        if matched is not None:
            current_section = matched
            continue
        if current_section:
            sections[current_section] += line + "\n"
    return sections
57
+
58
def parse_skills(section_text, ocr_used=False):
    """Turn the skills section into a list of individual skills.

    Items are separated by commas, pipes or line breaks, in any mix, so
    ``"Python, Java\\nC++ | Go"`` yields four skills rather than leaving a
    newline embedded in one item.

    The previous ``skills.json`` dictionary fallback was removed because it
    could never run: any non-blank section always produced at least one
    line-based item before the fallback was reached.

    Args:
        section_text: Raw text of the skills section; may be ``None``.
        ocr_used: True when the text came from OCR, which lowers the
            confidence the same way the other section parsers do.

    Returns:
        A ``(skills, confidence)`` tuple. ``skills`` is a list of stripped,
        non-empty strings, or ``None`` when nothing was found. The
        confidence is 0.0 for ``None``, 0.9 under OCR and 1.0 otherwise.
    """
    if not section_text or not section_text.strip():
        return None, 0.0

    skills = [
        item.strip()
        for line in section_text.splitlines()
        for item in re.split(r"[,|]", line)
        if item.strip()
    ]
    if not skills:
        # The section contained only separators.
        return None, 0.0
    return skills, 0.9 if ocr_used else 1.0
88
+
89
def parse_experience(section_text, ocr_used=False):
    """Clean up the experience section.

    Blank lines are removed, and so is any line containing the whole word
    "project" or "skill" (case-insensitive); the plural forms do not match
    and are kept.

    Args:
        section_text: Raw text of the experience section; may be ``None``.
        ocr_used: True when the text came from OCR.

    Returns:
        A ``(value, confidence)`` tuple. ``value`` is the kept lines joined
        with newlines, or ``None`` when nothing remains. The confidence is
        0.0 for ``None``, 0.9 under OCR and 1.0 otherwise.
    """
    if not section_text or not section_text.strip():
        return None, 0.0

    excluded = re.compile(r"\b(project|skill)\b", re.IGNORECASE)
    exp_lines = [
        line
        for line in section_text.splitlines()
        if line.strip() and not excluded.search(line)
    ]
    if not exp_lines:
        return None, 0.0
    return "\n".join(exp_lines).strip(), 0.9 if ocr_used else 1.0
103
+
104
def parse_education(section_text, ocr_used=False):
    """Return the education section with blank lines removed.

    Args:
        section_text: Raw text of the education section; may be ``None``.
        ocr_used: True when the text came from OCR.

    Returns:
        A ``(value, confidence)`` tuple. ``value`` is the non-blank lines
        joined with newlines, or ``None`` for an empty section. The
        confidence is 0.0 for ``None``, 0.9 under OCR and 1.0 otherwise.
    """
    if not section_text or not section_text.strip():
        return None, 0.0
    kept = [line for line in section_text.splitlines() if line.strip()]
    return "\n".join(kept).strip(), 0.9 if ocr_used else 1.0
111
+
112
def parse_certifications(section_text, ocr_used=False):
    """Return the certifications section with blank lines removed.

    Args:
        section_text: Raw text of the certifications section; may be
            ``None``.
        ocr_used: True when the text came from OCR.

    Returns:
        A ``(value, confidence)`` tuple. ``value`` is the non-blank lines
        joined with newlines, or ``None`` for an empty section. The
        confidence is 0.0 for ``None``, 0.9 under OCR and 1.0 otherwise.
    """
    if not section_text or not section_text.strip():
        return None, 0.0
    kept = [line for line in section_text.splitlines() if line.strip()]
    return "\n".join(kept).strip(), 0.9 if ocr_used else 1.0
119
+
120
def parse_projects(section_text, ocr_used=False):
    """Group the projects section into one line per project.

    A line containing a four-digit number, "present" or "github"
    (case-insensitive) starts a new project and becomes its title. The
    lines that follow are joined with spaces into its description. Text
    that appears before the first title is kept as its own entry instead
    of being discarded, so a section without any recognisable title still
    returns its content.

    Args:
        section_text: Raw text of the projects section; may be ``None``.
        ocr_used: True when the text came from OCR.

    Returns:
        A ``(value, confidence)`` tuple. ``value`` has one entry per line,
        formatted as ``"title: description"``, or just the title or the
        description when the other part is missing. It is ``None`` for an
        empty section. The confidence is 0.0 for ``None``, 0.9 under OCR
        and 1.0 otherwise.
    """
    if not section_text or not section_text.strip():
        return None, 0.0

    title_pattern = re.compile(r"\d{4}|present|github", re.IGNORECASE)
    entries = []  # (title, [description lines]) pairs in document order
    for raw_line in section_text.splitlines():
        line = raw_line.strip()
        if not line:
            continue
        if title_pattern.search(line):
            entries.append((line, []))
        elif entries:
            entries[-1][1].append(line)
        else:
            # Leading text with no title yet: keep it rather than lose it.
            entries.append(("", [line]))

    formatted = []
    for title, description_lines in entries:
        description = " ".join(description_lines)
        if title and description:
            formatted.append(f"{title}: {description}")
        else:
            formatted.append(title or description)

    if not formatted:
        return None, 0.0
    return "\n".join(formatted), 0.9 if ocr_used else 1.0
142
+
143
def parse_header_fields(text):
    """Extract name, email, phone and LinkedIn URL from resume text.

    The name is looked for only in the lines before the first line that
    mentions a section keyword, and in at most the first eight lines. A
    line qualifies when it consists of capitalised words, or when it is
    all upper case with at least two words (then it is title-cased).
    Email, phone and LinkedIn are searched for in the whole text, and the
    first match of each is returned.

    Args:
        text: Full resume text; ``None`` is treated as empty.

    Returns:
        A dict with the keys ``name``, ``email``, ``phone`` and
        ``linkedin``. Each value is ``{"value": ..., "confidence": ...}``
        where the value is the matched string or ``None``, and the
        confidence is 0.99 when found and 0.0 otherwise.
    """
    text = text or ""
    lines = [line.strip() for line in text.splitlines()]
    section_keywords = ("objective", "summary", "experience", "education",
                        "project", "skill", "certification", "interests")

    # Index of the first line that looks like a section header; the name
    # is expected above it.
    header_idx = next(
        (i for i, line in enumerate(lines)
         if any(kw in line.lower() for kw in section_keywords)),
        len(lines),
    )

    name = ""
    for line in lines[:min(header_idx, 8)]:
        if not line:
            continue
        # Capitalised words, e.g. "John Smith".
        if re.match(r'^[A-Z][a-z]+(?:\s+[A-Z][a-z]*)*$', line):
            name = line
            break
        # All-caps names, e.g. "JOHN SMITH".
        if re.match(r'^[A-Z\s]{3,}$', line) and len(line.split()) >= 2:
            name = line.title()
            break

    # The TLD class is [A-Za-z]; a "|" inside the class would be a literal
    # pipe and let "a@b.com|next" match past the address.
    email_match = re.search(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b', text)
    phone_match = re.search(r'(?:\+?(\d{1,3}))?[-. (]*(\d{3})[-. )]*(\d{3})[-. ]*(\d{4})', text)
    linkedin_match = re.search(r'(https?://)?(www\.)?linkedin\.com/in/[\w-]+', text)

    def _field(match):
        # Build the value/confidence record for one regex result.
        if match:
            return {"value": match.group(0), "confidence": 0.99}
        return {"value": None, "confidence": 0.0}

    return {
        "name": {"value": name or None, "confidence": 0.99 if name else 0.0},
        "email": _field(email_match),
        "phone": _field(phone_match),
        "linkedin": _field(linkedin_match),
    }
182
+
183
def parse_resume(file_path):
    """Parse a PDF resume into a dict of fields with confidence scores.

    The text is extracted from the PDF, split into sections, and each
    section is handed to its parser together with the OCR flag.

    Args:
        file_path: Path of the PDF resume.

    Returns:
        A dict holding the header fields returned by
        ``parse_header_fields`` followed by ``skills``, ``experience``,
        ``education``, ``projects`` and ``certifications``. Each of those
        five values is ``{"value": ..., "confidence": ...}``.
    """
    text, ocr_used = extract_text_from_pdf(file_path)
    sections = split_sections(text)
    result = dict(parse_header_fields(text))

    # Listed in the order the keys appear in the result.
    section_parsers = (
        ("skills", parse_skills),
        ("experience", parse_experience),
        ("education", parse_education),
        ("projects", parse_projects),
        ("certifications", parse_certifications),
    )
    for key, parser in section_parsers:
        value, confidence = parser(sections.get(key, ''), ocr_used)
        result[key] = {"value": value, "confidence": confidence}

    return result