Spaces:
Runtime error
Runtime error
Update utility/utils.py
Browse files- utility/utils.py +51 -7
utility/utils.py
CHANGED
|
@@ -402,7 +402,34 @@ def process_extracted_text(extracted_text):
|
|
| 402 |
|
| 403 |
# Process the model output for parsed result
|
| 404 |
def process_resume_data(LLMdata,cont_data,extracted_text):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 405 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 406 |
# Initialize the processed data dictionary
|
| 407 |
processed_data = {
|
| 408 |
"name": [],
|
|
@@ -415,15 +442,32 @@ def process_resume_data(LLMdata,cont_data,extracted_text):
|
|
| 415 |
"extracted_text": extracted_text
|
| 416 |
}
|
| 417 |
#LLM
|
|
|
|
| 418 |
processed_data['name'].extend(LLMdata.get('Name', []))
|
| 419 |
-
processed_data['contact_number'].extend(LLMdata.get('Contact', []))
|
| 420 |
processed_data['Designation'].extend(LLMdata.get('Designation', []))
|
| 421 |
-
processed_data['email'].extend(LLMdata.get("Email", []))
|
| 422 |
processed_data['Location'].extend(LLMdata.get('Address', []))
|
| 423 |
-
processed_data['Link'].extend(LLMdata.get('Link', []))
|
| 424 |
processed_data['Company'].extend(LLMdata.get('Company', []))
|
|
|
|
| 425 |
#Contact
|
| 426 |
-
processed_data['email'].extend(cont_data.get("emails", []))
|
| 427 |
-
processed_data['contact_number'].extend(cont_data.get("phone_numbers", []))
|
| 428 |
-
processed_data['Link'].extend(cont_data.get("links_RE", []))
|
| 429 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 402 |
|
| 403 |
# Process the model output for parsed result
|
| 404 |
def process_resume_data(LLMdata,cont_data,extracted_text):
|
| 405 |
+
|
| 406 |
+
# Removing duplicate emails (case insensitive)
|
| 407 |
+
unique_emails = []
|
| 408 |
+
for email in cont_data['emails']:
|
| 409 |
+
if not any(email.lower() == existing_email.lower() for existing_email in LLMdata['Email']):
|
| 410 |
+
unique_emails.append(email)
|
| 411 |
+
|
| 412 |
+
# Removing duplicate links (case insensitive)
|
| 413 |
+
unique_links = []
|
| 414 |
+
for link in cont_data['links_RE']:
|
| 415 |
+
if not any(link.lower() == existing_link.lower() for existing_link in LLMdata['Link']):
|
| 416 |
+
unique_links.append(link)
|
| 417 |
|
| 418 |
+
# Removing duplicate phone numbers (compared by their last 10 characters)
|
| 419 |
+
normalized_contact = [num[-10:] for num in LLMdata['Contact']]
|
| 420 |
+
unique_numbers = []
|
| 421 |
+
for num in cont_data['phone_numbers']:
|
| 422 |
+
if num[-10:] not in normalized_contact:
|
| 423 |
+
unique_numbers.append(num)
|
| 424 |
+
|
| 425 |
+
# Add unique emails, links, and phone numbers to the original LLMdata
|
| 426 |
+
LLMdata['Email'] += unique_emails
|
| 427 |
+
LLMdata['Link'] += unique_links
|
| 428 |
+
LLMdata['Contact'] += unique_numbers
|
| 429 |
+
|
| 430 |
+
# Apply the function to the data
|
| 431 |
+
LLMdata=remove_duplicates_case_insensitive(LLMdata)
|
| 432 |
+
|
| 433 |
# Initialize the processed data dictionary
|
| 434 |
processed_data = {
|
| 435 |
"name": [],
|
|
|
|
| 442 |
"extracted_text": extracted_text
|
| 443 |
}
|
| 444 |
#LLM
|
| 445 |
+
|
| 446 |
processed_data['name'].extend(LLMdata.get('Name', []))
|
| 447 |
+
#processed_data['contact_number'].extend(LLMdata.get('Contact', []))
|
| 448 |
processed_data['Designation'].extend(LLMdata.get('Designation', []))
|
| 449 |
+
#processed_data['email'].extend(LLMdata.get("Email", []))
|
| 450 |
processed_data['Location'].extend(LLMdata.get('Address', []))
|
| 451 |
+
#processed_data['Link'].extend(LLMdata.get('Link', []))
|
| 452 |
processed_data['Company'].extend(LLMdata.get('Company', []))
|
| 453 |
+
|
| 454 |
#Contact
|
| 455 |
+
#processed_data['email'].extend(cont_data.get("emails", []))
|
| 456 |
+
#processed_data['contact_number'].extend(cont_data.get("phone_numbers", []))
|
| 457 |
+
#processed_data['Link'].extend(cont_data.get("links_RE", []))
|
| 458 |
+
|
| 459 |
+
#New_merge_data
|
| 460 |
+
processed_data['email'].extend(LLMdata['Email'])
|
| 461 |
+
processed_data['contact_number'].extend(LLMdata['Contact'])
|
| 462 |
+
processed_data['Link'].extend(LLMdata['Link'])
|
| 463 |
+
|
| 464 |
+
# Remove fields whose value was not found
|
| 465 |
+
# List of keys to check for 'Not found'
|
| 466 |
+
keys_to_check = ["name", "contact_number", "Designation", "email", "Location", "Link", "Company"]
|
| 467 |
+
|
| 468 |
+
# Delete each key whose value is ['Not found'], ['not found'], or an empty list
|
| 469 |
+
for key in keys_to_check:
|
| 470 |
+
if processed_data[key] == ['Not found'] or processed_data[key] == ['not found'] or processed_data[key] == []:
|
| 471 |
+
del processed_data[key]
|
| 472 |
+
|
| 473 |
+
return processed_data
|