akshansh36 committed on
Commit
7250828
·
verified ·
1 Parent(s): b3cb287

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +439 -0
app.py ADDED
@@ -0,0 +1,439 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from pymongo import MongoClient
3
+ import fitz # PyMuPDF
4
+ import ast
5
+ import re
6
+ from groq import Groq
7
+ import concurrent.futures
8
+ import pandas as pd
9
+ import io
10
+ import json
11
+ import requests
12
import os

DB_NAME = 'akshansh_db'

# Secrets are read from the environment instead of being written in source.
# The MongoDB password and Groq API key that were previously inline here were
# published with this commit and should be rotated.
mongodb_uri = os.environ.get("MONGODB_URI", "")
groq_api = os.environ.get("GROQ_API_KEY", "")

# Endpoint of the llmsherpa document-parsing service. The default is the
# address the app has always used (minus an accidental leading space); it can
# be overridden without a code change.
llmsherpa_api_url = os.environ.get(
    "LLMSHERPA_API_URL",
    "http://65.2.175.211:5010/api/parseDocument?renderFormat=all&applyOcr=yes",
).strip()

# Always define these names so later code can test them instead of hitting a
# NameError when the database setup below fails.
client = None
db = None
collection = None

if mongodb_uri:
    try:
        client = MongoClient(mongodb_uri)
        db = client[DB_NAME]
        collection = db['parsed_resume_streamlit']
        print("MongoDB connection established.")
    except Exception as e:
        client = None
        db = None
        collection = None
        print(f"Error connecting to MongoDB: {e}")
else:
    print("Error connecting to MongoDB: MONGODB_URI environment variable is not set.")
24
+
25
+
26
def sanitize_text(text):
    """Escape characters that would break a single-quoted Python string literal.

    The resume text is embedded in a prompt that asks the model to answer with
    a single-quoted dict literal, which is later parsed with
    ``ast.literal_eval``.

    Args:
        text: Raw resume text. ``None`` is treated as empty text.

    Returns:
        The text with backslashes doubled and single quotes backslash-escaped.
    """
    if text is None:
        return ""
    # Backslashes must be escaped first: otherwise an input that already
    # contains \' would turn into \\' (an escaped backslash followed by a bare,
    # string-terminating quote).
    return text.replace("\\", "\\\\").replace("'", "\\'")
29
+
30
def process_using_llm(text):
    """Send resume text to the Groq chat API and return the model's raw reply.

    The text is passed through ``sanitize_text`` and embedded in a prompt that
    asks for an ``extracted_content: {...}`` dictionary with a fixed set of keys.

    Args:
        text: Plain-text content of a resume.

    Returns:
        The content of the first completion choice, or ``None`` if anything
        in the request fails (the error is printed).
    """
    try:
        resume_text = sanitize_text(text)
        groq_client = Groq(api_key=groq_api)
        prompt = f"""
        1. Given is the text content of a resume, please extract information from it and output the result in a dictionary format which is defined below along with the expected data structure, strictly adhere to the dictionary format given below, if any field is not present leave it empty.

        Note: 1. Do not skip any information and do not add any information which is not present in the input content.
        2. In case of github urls, linkedin urls, email id, add only if the url is present else leave it empty.
        3. For the work experience only the latest work experience is required that is the one which is presntly being done or done at the last.
        4. In the format of extracted_content, do not give any other things, like comments or anything
        Input: {resume_text}

        Expected output format: "extracted_content: {{
        'name': 'String',
        'email': 'String',
        'phone': 'String',
        'location': 'String',
        'linkedin': 'String',
        'github':'String',
        'inter_personal_skills': [
        'String'
        ],
        'technical_skills': [
        'String'
        ],
        'soft_skills':[
        'String'
        ],
        'programming_languages':[
        'String'
        ],
        'linguistic_languages':[
        'String'
        ],
        'latest_work_experience':{{
        'company': 'String',
        'role': 'String',
        'duration': 'String',
        'work_location': 'String',
        }},
        'graduation_details':{{
        'course':'String',
        'institution':'String',
        'course_type':'String',
        'year_of_graduation':'String',
        'percentage_or_cgpa':'String'
        }},

        'higher_secondary_education':{{
        'institution':'String',
        'education_board_type':'String',
        'year_of_completion':'String',
        'percentage_or_cgpa':'String'
        }},
        'secondary_education':{{
        'institution':'String',
        'education_board_type':'String',
        'year_of_completion':'String',
        'percentage_or_cgpa':'String'
        }}

        }}"

        """
        # A single user turn carries the whole instruction plus the resume.
        conversation = [{"role": "user", "content": prompt}]
        reply = groq_client.chat.completions.create(
            messages=conversation,
            model="llama3-70b-8192",
        )
        first_choice = reply.choices[0]
        return first_choice.message.content
    except Exception as e:
        print(f"An error occurred in LLM part: {e}")
        return None
109
+
110
def extract(output):
    """Pull the ``extracted_content`` dictionary out of the LLM's reply.

    The reply is expected to contain ``extracted_content: { ... }``. The braces
    are matched greedily so nested dictionaries are captured in full.

    Args:
        output: Raw text returned by the model (may be ``None`` or empty).

    Returns:
        The parsed dictionary, or ``{}`` when no payload is found, the payload
        cannot be parsed, or it does not evaluate to a dict.
    """
    if not output:
        print("No extracted content found in parsing llm's output")
        return {}

    match = re.search(r'extracted_content:\s*(\{.*\})', output, re.DOTALL)
    if not match:
        print("No extracted content found in parsing llm's output")
        return {}

    payload = match.group(1)
    try:
        # literal_eval only evaluates Python literals, so it is safe to run on
        # model output; malformed text raises instead of executing anything.
        parsed = ast.literal_eval(payload)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        # Models sometimes answer with JSON (double quotes, null/true/false),
        # which is not a valid Python literal; try that before giving up.
        try:
            parsed = json.loads(payload)
        except (ValueError, RecursionError):
            print("Could not parse extracted content from llm's output")
            return {}

    # Callers treat the result as a mapping; reject sets, lists, etc.
    return parsed if isinstance(parsed, dict) else {}
118
+
119
def _blocks_to_text(blocks):
    """Flatten llmsherpa layout blocks into newline-terminated plain text."""
    lines = []
    for block in blocks or []:
        if not isinstance(block, dict):
            continue
        if block.get('tag') == "table":
            for row in block.get('table_rows') or []:
                cells = row.get('cells')
                if cells:
                    for cell in cells:
                        value = cell.get('cell_value')
                        if isinstance(value, dict):
                            # A cell may itself hold a block with sentences.
                            lines.extend(value.get('sentences') or [])
                        elif value not in ('', None):
                            lines.append(value)
                else:
                    # Rows without a cell list carry their value directly.
                    value = row.get('cell_value')
                    if value:
                        lines.append(value)
        else:
            # Blocks without sentences contribute nothing.
            lines.extend(block.get('sentences') or [])
    return "".join(f"{line}\n" for line in lines)


def process_resume(pdf_content, filename='resume.pdf', timeout=300):
    """Parse one resume PDF into a structured dictionary.

    The PDF is posted to the llmsherpa service, the returned layout blocks are
    flattened to text, and that text is handed to the LLM for field extraction.

    Args:
        pdf_content: Raw bytes of the PDF file.
        filename: Name reported for the uploaded file in the multipart request.
        timeout: Seconds to wait for the parsing service before giving up.

    Returns:
        The extracted dictionary; ``{}`` if the LLM step produced nothing
        usable; ``None`` if the parsing service failed or returned no text.
    """
    try:
        response = requests.post(
            llmsherpa_api_url,
            files={'file': (filename, pdf_content, 'application/pdf')},
            timeout=timeout,
        )
    except requests.RequestException as e:
        # A network failure on one resume must not abort the whole batch.
        print(f"Failed to reach the document parsing service: {e}")
        return None

    # Check if the response is valid JSON
    try:
        response_json = response.json()
    except ValueError:
        print("Failed to decode JSON response")
        return None

    if not isinstance(response_json, dict):
        return None
    parsed = (response_json.get('return_dict') or {}).get('result')
    if not isinstance(parsed, dict):
        return None

    content = _blocks_to_text(parsed.get('blocks'))
    if not content:
        return None

    processed_text = process_using_llm(content)
    if not processed_text:
        return {}

    try:
        return extract(processed_text)
    except (ValueError, SyntaxError) as e:
        # Malformed model output counts as a failed resume, not a crash.
        print(f"Could not parse llm's output: {e}")
        return {}
175
+
176
+
177
+
178
+
179
+
180
+
181
+
182
+
183
+
184
+
185
def json_to_excel(data):  # data is a list of JSON
    """Flatten parsed-resume dictionaries into a DataFrame for Excel export.

    Despite the name, this returns a ``pandas.DataFrame``; writing the
    workbook is left to the caller.

    Args:
        data: Iterable of dictionaries in the shape requested from the LLM
            (top-level contact fields, nested education / work sections, and
            list-valued skill fields).

    Returns:
        A DataFrame with one row per input item, containing only the columns
        that received at least one value, in a fixed order. ``None`` if an
        unexpected error occurs (the error is printed).
    """
    # Define the specific order of columns
    column_order = [
        'Name', 'Phone', 'Location', 'Email', 'Linkedin', 'Github',
        'Graduation Details', 'Graduation Institution', 'Graduation Course Type',
        'Year of Graduation', 'Aggregate Percentage in Graduation',
        'Higher Secondary Institute Name', 'Higher Secondary Education Board Type',
        'Year of Completion of Higher Secondary Education',
        'Aggregate Percentage in Higher Secondary Education',
        'Secondary Education Institute Name', 'Secondary Education Board Type',
        'Year of Completion of Secondary Education', 'Aggregate Percentage in Secondary Education',
        'Current Working Organization', 'Current Designation', 'Current Work Duration',
        'Current Work Location', 'Inter Personal Skills', 'Technical Skills',
        'Soft Skills', 'Programming Languages', 'Languages',
    ]

    # Top-level scalar fields: source key -> column title.
    scalar_columns = (
        ('name', 'Name'),
        ('phone', 'Phone'),
        ('location', 'Location'),
        ('email', 'Email'),
        ('linkedin', 'Linkedin'),
        ('github', 'Github'),
    )

    # Nested sections: section key -> ((field key, column title), ...).
    nested_columns = (
        ('graduation_details', (
            ('course', 'Graduation Details'),
            ('institution', 'Graduation Institution'),
            ('course_type', 'Graduation Course Type'),
            ('year_of_graduation', 'Year of Graduation'),
            ('percentage_or_cgpa', 'Aggregate Percentage in Graduation'),
        )),
        ('higher_secondary_education', (
            ('institution', 'Higher Secondary Institute Name'),
            ('education_board_type', 'Higher Secondary Education Board Type'),
            ('year_of_completion', 'Year of Completion of Higher Secondary Education'),
            ('percentage_or_cgpa', 'Aggregate Percentage in Higher Secondary Education'),
        )),
        ('secondary_education', (
            ('institution', 'Secondary Education Institute Name'),
            ('education_board_type', 'Secondary Education Board Type'),
            ('year_of_completion', 'Year of Completion of Secondary Education'),
            ('percentage_or_cgpa', 'Aggregate Percentage in Secondary Education'),
        )),
        ('latest_work_experience', (
            ('company', 'Current Working Organization'),
            ('role', 'Current Designation'),
            ('duration', 'Current Work Duration'),
            ('work_location', 'Current Work Location'),
        )),
    )

    # List-valued fields that are rendered as one comma-separated cell.
    list_columns = (
        ('inter_personal_skills', 'Inter Personal Skills'),
        ('technical_skills', 'Technical Skills'),
        ('soft_skills', 'Soft Skills'),
        ('programming_languages', 'Programming Languages'),
        ('linguistic_languages', 'Languages'),
    )

    def join_values(value):
        """Render a skills-style value as a single comma-separated string."""
        if value is None:
            return ""
        if isinstance(value, str):
            # Joining a bare string would split it into characters.
            return value
        if isinstance(value, (list, tuple)):
            return ", ".join(str(entry) for entry in value if entry is not None)
        return str(value)

    try:
        flat_data = []
        for item in data:
            # A non-dict entry still yields a (blank) row so rows stay aligned
            # with the input order.
            record = item if isinstance(item, dict) else {}
            flat_item = {}

            for key, column in scalar_columns:
                value = record.get(key)
                if value:
                    flat_item[column] = value

            for section_key, fields in nested_columns:
                section = record.get(section_key)
                # The model is told to leave missing fields empty, so a
                # section can come back as '' or None instead of a dict.
                if not isinstance(section, dict):
                    continue
                for key, column in fields:
                    value = section.get(key)
                    if value:
                        flat_item[column] = value

            for key, column in list_columns:
                if key in record:
                    flat_item[column] = join_values(record[key])

            flat_data.append(flat_item)

        # Create DataFrame
        df = pd.DataFrame(flat_data)

        # Reorder columns according to the specified order
        df = df[[col for col in column_order if col in df.columns]]

        return df

    except Exception as e:
        # Callers rely on None to signal that the export could not be built.
        print(f"Error occurred in converting JSON to Excel: {e}")
        return None
342
+
343
+
344
+
345
+
346
+
347
+
348
+
349
def _render_download(successful_resumes):
    """Offer the parsed resumes as an .xlsx download, or report that export failed."""
    # Convert the processed resume data to a pandas DataFrame
    df = json_to_excel(successful_resumes)
    if df is None:
        st.error("Aw! Snap, could not process any of the resumes. Please try again later.")
        return

    # Create an Excel file in memory
    excel_file = io.BytesIO()
    with pd.ExcelWriter(excel_file, engine='xlsxwriter') as writer:
        df.to_excel(writer, index=False, sheet_name='Resumes')

    st.download_button(
        label="Download XLSX file",
        data=excel_file.getvalue(),
        file_name="resume_data.xlsx",
        mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
    )


def main():
    """Streamlit entry point: upload resumes, parse them, and offer an Excel export.

    The user chooses how many resumes to upload (1-5) and must upload exactly
    that many PDFs. On "Process Resumes" each PDF is run through
    ``process_resume`` in a thread pool, every non-empty result is inserted
    into the MongoDB collection, and the outcome is kept in
    ``st.session_state`` so it is still available on later reruns.
    """
    st.title('Resume Parser')

    # Allow the user to specify the maximum number of resumes to upload
    max_resumes = st.number_input("Maximum number of resumes to upload, limit: 5", min_value=1, max_value=5, value=1, step=1)

    # Allow the user to upload the resumes
    uploaded_files = st.file_uploader("Upload your resumes", type=["pdf"], accept_multiple_files=True)

    if not uploaded_files:
        return
    if len(uploaded_files) != max_resumes:
        st.warning(f"Please upload exactly {max_resumes} resumes.")
        return

    state_key = "resume_parser_outcome"
    # Identifies the current upload so results from a previous set of files
    # are never shown against a new one.
    batch_id = tuple((uploaded.name, uploaded.size) for uploaded in uploaded_files)

    if st.button("Process Resumes"):
        if state_key in st.session_state:
            del st.session_state[state_key]
        try:
            with st.spinner("Your resumes are being processed..."):
                # getvalue() returns the full bytes regardless of the read
                # position left by an earlier run.
                pdf_contents = [uploaded.getvalue() for uploaded in uploaded_files[:max_resumes]]

                # Process each PDF content using the process_resume function
                with concurrent.futures.ThreadPoolExecutor() as executor:
                    results = list(executor.map(process_resume, pdf_contents))

                successful_resumes = []
                failed_resumes_count = 0
                for result in results:
                    if result:
                        successful_resumes.append(result)
                        collection.insert_one(result)
                    else:
                        failed_resumes_count += 1
        except Exception as e:
            st.error("Aw! Snap, could not process your resumes. Please try again later.")
            print(f"Error processing resumes: {e}")
            return

        # st.button is only True on the run triggered by the click; any later
        # widget interaction (the radio, the download button) reruns the
        # script with it False, so the outcome has to live in session state.
        st.session_state[state_key] = {
            "batch_id": batch_id,
            "successful": successful_resumes,
            "failed": failed_resumes_count,
            "total": max_resumes,
        }

    outcome = st.session_state.get(state_key)
    if not outcome or outcome["batch_id"] != batch_id:
        return

    try:
        successful_resumes = outcome["successful"]
        failed_resumes_count = outcome["failed"]

        if not successful_resumes:
            st.error("Aw! Snap, could not process any of the resumes. Please try again later.")
            return

        st.success(f"Resumes processed successfully! {len(successful_resumes)} out of {outcome['total']} resumes processed.")

        if failed_resumes_count > 0:
            st.warning(f"{failed_resumes_count} resumes could not be processed. Do you still want to download the successfully processed resumes?")
            user_response = st.radio("Please select:", ("Yes", "No"))
            if user_response == "No":
                st.info("Then try again after some time.")
                return

        _render_download(successful_resumes)
    except Exception as e:
        st.error("Aw! Snap, could not process your resumes. Please try again later.")
        print(f"Error processing resumes: {e}")


if __name__ == "__main__":
    main()