# Python module to parse different sections from a resume
from pdfminer.high_level import extract_pages, extract_text
from pdfminer.layout import LTTextContainer, LTChar, LTTextLineHorizontal
from collections import defaultdict
from flask import jsonify
import re, fitz, requests, logging, datetime
from .config import data_science_skills, keyword_variations, essential_skills, quality_mapping, Extract_sections, suggested_projects, ignore_rule_ids
from .config import required_sections, linkedin_domain, github_domain, basic_informations, section_headers, common_projects, ignore_error_keywords,blog_articles,youtube_links
from .config import kaggle_domain,hackerrank_domain,leetcode_domain,medium_domain
from spacy.matcher import Matcher
import language_tool_python
import random
tool = language_tool_python.LanguageTool('en-US')
class ResumeParser:
def extract_contact_number_from_resume(self, text):
contact_number = None
suggestion = ""
# Use regex pattern to find a potential contact number
pattern = r"\b(?:\+?\d{1,3}[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}\b"
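# Matches common formats such as "9876543210", "987-654-3210", or "+91 9876543210"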
match = re.search(pattern, text)
if match:
contact_number = match.group()
# Check if the contact number is of the correct length
digits_only = re.sub(r'\D', '', contact_number)
if len(digits_only) == 10:
suggestion = ""
elif len(digits_only) > 10 and digits_only.startswith('91') and len(digits_only[2:]) == 10:
suggestion = ""
else:
suggestion = "Contact number should have exactly 10 digits."
return contact_number, suggestion
def extract_hyperlinks(self, pdf_path):
doc = fitz.open(pdf_path)
links = []
for page_num in range(len(doc)):
page = doc.load_page(page_num)
link_list = page.get_links()
for link in link_list:
uri = link.get('uri', None)
if uri:
links.append(uri)
return links
def extract_text_from_pdf(self, pdf_path):
return extract_text(pdf_path)
def extract_email_from_text(self, text):
pattern = r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b"
match = re.search(pattern, text)
if match:
return match.group()
return None
def extract_email_from_resume(self, pdf_path):
text = self.extract_text_from_pdf(pdf_path)
email = self.extract_email_from_text(text)
suggestion = ""
# If no email found in text, check hyperlinks
if not email:
links = self.extract_hyperlinks(pdf_path)
for link in links:
if link.startswith('mailto:'):
email_candidate = link.split('mailto:')[1]
if self.is_valid_email(email_candidate):
email = email_candidate
break
# Additional validation for email found in text or links
if email and not self.is_valid_email(email):
suggestion += "Your email address doesn't seem to be valid. Please check and correct."
return email, suggestion
def is_valid_email(self, email):
# Length check
if len(email) > 254:
return False
# Consecutive special characters check
if re.search(r"[._%+-]{2,}", email):
return False
# Domain part validation
domain_part = email.split('@')[1]
if not re.match(r"[A-Za-z0-9.-]+\.[A-Za-z]{2,}", domain_part):
return False
# Standard email format check
pattern = r"^[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$"
return re.match(pattern, email) is not None
def extract_sections_from_resume(self, text):
missing_sections = []
sections_not_capitalized = []
for section in required_sections:
pattern = r"\b{}\b".format(re.escape(section))
match_obj = re.search(pattern, text, re.IGNORECASE)
if not match_obj:
missing_sections.append(section)
else:
if match_obj.group() not in map(str.upper, required_sections):
sections_not_capitalized.append(section)
return missing_sections, sections_not_capitalized
def extract_skills_from_resume(self, text):
if not isinstance(text, str):
raise ValueError(f"Expected 'text' to be a string, but got {type(text)}")
skills = []
for skill in essential_skills:
pattern = r"\b{}\b".format(re.escape(skill))
match = re.search(pattern, text, re.IGNORECASE)
if match:
skills.append(skill)
return skills
def extract_keyword_variations_from_resume(self, text):
found_keywords = []
for keyword, variations in keyword_variations.items():
for variation in variations:
if variation.lower() in text.lower():
found_keywords.append(variation)
break
return found_keywords
def extract_keyword_variations_from_formatted_text(self, formatted_text):
found_keyword_section = []
for keyword, variations in keyword_variations.items():
for variation in variations:
if variation.lower() in formatted_text.lower():
found_keyword_section.append(variation)
break
return found_keyword_section
def extract_linkedIn_urls_from_pdf(self, pdf_path):
linkedin_urls = None
pdf_document = fitz.open(pdf_path)
for page_num in range(len(pdf_document)):
page = pdf_document.load_page(page_num)
links = page.get_links()
for link in links:
url = link.get('uri', '')
if re.search(linkedin_domain, url):
linkedin_urls = url
pdf_document.close()
return linkedin_urls
def extract_github_urls_from_pdf(self, pdf_path):
github_urls = None
pdf_document = fitz.open(pdf_path)
for page_num in range(len(pdf_document)):
page = pdf_document.load_page(page_num)
links = page.get_links()
for link in links:
url = link.get('uri', '')
if re.search(github_domain, url):
path = re.sub(github_domain, '', url)
parts = path.split('/')
if len(parts) == 1:
github_urls = url
pdf_document.close()
return github_urls
def extract_extra_urls_pdf(self,pdf_path, domains):
extracted_urls = defaultdict(set)
pdf_document = None
try:
# Open the PDF document
pdf_document = fitz.open(pdf_path)
# Iterate through all pages in the PDF
for page_num in range(len(pdf_document)):
page = pdf_document.load_page(page_num)
links = page.get_links()
for link in links:
url = link.get('uri', '')
if url: # Ensure there's a URL
for domain in domains:
if re.search(domain, url, re.IGNORECASE):
extracted_urls[domain].add(url) # Add URL to the domain's set
except Exception as e:
print(f"Error processing PDF: {e}")
finally:
if pdf_document is not None:
pdf_document.close()
return {domain: list(urls) for domain, urls in extracted_urls.items()}
def is_valid_url(self, github_urls):
suggest = ""
if not github_urls:
return suggest
try:
response = requests.head(github_urls)
if response.status_code != 200:
suggest = "GitHub URL is not valid, please check and correct. "
except requests.RequestException:
suggest = "GitHub URL is not valid, please check and correct. "
return suggest
def is_valid_name(self, name):
if any(char.isdigit() for char in name):
return False
if len(name.split()) > 3:
return False
common_non_names = {"Email", "Github", "LinkedIn", "Portfolio", "Data Analyst"}
if name in common_non_names:
return False
return True
def extract_name(self, resume_text):
lines = resume_text.split('\n')
# Use regex to find lines that likely contain names
name_lines = [line for line in lines if re.match(r'^[A-Za-z]*\s[A-Za-z]*', line.strip())]
names = []
for i in range(len(name_lines)):
if self.is_valid_name(name_lines[i].strip()):
names.append(name_lines[i].strip())
if len(names) >= 1:
name = names[0]
suggestion = ""
# Check if the name parts contain only alphabetic characters
name_parts = name.split()
if any(part[0].islower() for part in name_parts):
suggestion += " name should start with a capital letter. "
return name, suggestion
return None, "No valid name found"
def check_missing_sections(self, resume_data):
missing_information = []
for section in basic_informations:
if not resume_data.get(section):
missing_information.append(section)
return missing_information
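# Split the raw resume text into sections: a line matching a known header from
# section_headers starts a new bucket, and every following line is collected
# under that header until the next header appears.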
def segregate_sections(self, text):
header_pattern = re.compile(rf'^\s*({"|".join(re.escape(header) for header in section_headers)}):?\s*$', re.IGNORECASE)
sections_text = {}
current_section = None
lines = text.splitlines()
for line in lines:
clean_line = line.strip()
match = header_pattern.match(clean_line)
if match:
current_section = match.group(1).upper()
sections_text[current_section] = []
elif current_section:
sections_text[current_section].append(line.strip())
return sections_text
def extract_and_format_sections(self, sections_text, Extract_sections):
formatted_text = ""
for section in Extract_sections:
if section in sections_text:
section_content = " ".join(sections_text[section]).replace('\n', ' ')
formatted_text += f"{section}:\n{section_content}\n\n"
return formatted_text
def replace_keywords_with_placeholders(self, formatted_text, found_keyword_section):
placeholder_text = formatted_text
keyword_placeholders = {}
# Use a set to avoid duplicates and keep track of keyword placeholders
used_keywords = set()
for i, keyword in enumerate(found_keyword_section):
if keyword not in used_keywords:
used_keywords.add(keyword)
placeholder = f"{{KEYWORD_{i}}}"
keyword_placeholders[placeholder] = keyword
# Using word boundary to match whole words
placeholder_text = re.sub(r'\b' + re.escape(keyword) + r'\b', placeholder, placeholder_text, flags=re.IGNORECASE)
return placeholder_text, keyword_placeholders
def replace_placeholders_with_keywords(self, grammar_issues, keyword_placeholders):
updated_issues = []
for issue in grammar_issues:
context = issue['context']
for placeholder, keyword in keyword_placeholders.items():
context = context.replace(placeholder, keyword)
# Update the context in the issue dictionary
issue['context'] = context
updated_issues.append(issue)
return updated_issues
def grammar_check(self, placeholder_text):
matches = tool.check(placeholder_text)
grammar_issues = []
for match in matches:
issue = {
"context": match.context,
"error": match.message,
"rule_id": match.ruleId,
"suggested_correction": match.replacements
}
grammar_issues.append(issue)
return grammar_issues
def filter_grammar_issues(self, grammar_issues, ignore_rule_ids=None, ignore_error_keywords=None):
if ignore_rule_ids is None:
ignore_rule_ids = []
if ignore_error_keywords is None:
ignore_error_keywords = []
filtered_issues = []
for issue in grammar_issues:
if issue['rule_id'] not in ignore_rule_ids and not any(keyword in issue['error'] for keyword in ignore_error_keywords):
filtered_issues.append(issue)
return filtered_issues
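# Grammar-check pipeline for the requested sections: segregate the text, format
# the sections, mask known domain keywords with placeholders so LanguageTool
# does not flag them, run the check, restore the keywords, and drop issues whose
# rule IDs or messages are configured to be ignored.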
def process_resume(self, text, found_keyword_section, Extract_sections):
sections_text = self.segregate_sections(text)
formatted_text = self.extract_and_format_sections(sections_text, Extract_sections)
found_keyword_section = self.extract_keyword_variations_from_formatted_text(formatted_text)
placeholder_text, keyword_placeholders = self.replace_keywords_with_placeholders(formatted_text, found_keyword_section)
grammar_issues = self.grammar_check(placeholder_text)
grammar_issues_text = self.replace_placeholders_with_keywords(grammar_issues, keyword_placeholders)
filtered_grammar_issues = self.filter_grammar_issues(grammar_issues, ignore_rule_ids, ignore_error_keywords)
return filtered_grammar_issues
def grammar_issue_check(self, text, found_keyword_section, Extract_sections):
issues = {}
text1 = " ".join(text.split("\n"))
for section in Extract_sections:
grammar_issues = self.process_resume(text, found_keyword_section, [section])
if not grammar_issues:
grammar_issues = "no error found"
issues[section] = grammar_issues
return issues
def normalize_font_name(self,font_name):
if '-' in font_name:
font_name = font_name.split('-')[0]
if '+' in font_name:
font_name = font_name.split('+')[1]
return font_name
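# Walk every character in the PDF layout (via pdfminer) and group consecutive
# characters sharing the same font size, font name, and page into phrases.
# Phrases contained in predefined_terms (name, email, section headers) are
# skipped so they do not trigger formatting warnings.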
def extract_text_properties(self, pdf_path, predefined_terms):
text_properties = []
current_phrase = ""
current_font_size = None
current_font_name = None
current_page_num = None
special_characters = set("●▪•!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~")  # bullet glyphs and punctuation treated as phrase separators
def add_current_phrase():
nonlocal current_phrase
if current_phrase.strip():
flag = any(current_phrase in term for term in predefined_terms)
if not flag:
text_properties.append({
"text": current_phrase,
"font_size": current_font_size,
"font_name": current_font_name,
"page_num": current_page_num
})
current_phrase = ""
for page_layout in extract_pages(pdf_path):
for element in page_layout:
if isinstance(element, LTTextContainer):
for text_line in element:
if isinstance(text_line, LTTextLineHorizontal):
for character in text_line:
if isinstance(character, LTChar):
text = character.get_text()
font_size = round(character.size, 2)
font_name = self.normalize_font_name(character.fontname)
page_num = page_layout.pageid
if text.isspace() or text in special_characters:
add_current_phrase()
continue
if (font_size != current_font_size or font_name != current_font_name or
page_num != current_page_num):
add_current_phrase()
current_font_size = font_size
current_font_name = font_name
current_page_num = page_num
current_phrase += text
add_current_phrase()
return text_properties
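# Bucket the extracted phrases by (font name, font size rounded to the nearest
# 'tolerance' points); the largest bucket is treated as the resume's base style
# and everything else is reported as a formatting deviation.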
def group_similar_fonts(self,text_properties, tolerance=0.5):
grouped_properties = defaultdict(list)
for prop in text_properties:
rounded_size = round(prop["font_size"] / tolerance) * tolerance
key = (prop["font_name"], rounded_size)
grouped_properties[key].append(prop)
return grouped_properties
def identify_different_fonts_and_sizes(self, grouped_properties):
most_common_group = max(grouped_properties.values(), key=len)
most_common_key = None
for key, group in grouped_properties.items():
if group == most_common_group:
most_common_key = key
break
different_texts = []
for key, group in grouped_properties.items():
if group != most_common_group:
for prop in group:
reason = []
if key[1] != most_common_key[1]:
reason.append(f"size not {most_common_key[1]}")
if key[0] != most_common_key[0]:
reason.append(f"font not {most_common_key[0]}")
different_texts.append({
"page_num": prop['page_num'],
"text": prop['text'],
"found_size": prop['font_size'],
"found_font_name": prop['font_name'],
"reason": ", ".join(reason)
})
return different_texts
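# Collect date strings from a section. Only entries containing at least two
# date tokens (i.e. a start/end range) are kept, so standalone years in running
# text are ignored.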
def parse_dates(self, sections_text, section_name):
# Check if the section is in the text
suggest = ""
# Define the date patterns to match various date formats
date_pattern = (
r'\b\d{1,2}/\d{4}\b|' # MM/YYYY
r'\b(?:jan(?:uary)?|feb(?:ruary)?|mar(?:ch)?|apr(?:il)?|may|jun(?:e)?|jul(?:y)?|aug(?:ust)?|sep(?:tember)?|oct(?:ober)?|nov(?:ember)?|dec(?:ember)?)\s+\d{4}\b|' # Month YYYY
r'\b(?:jan(?:uary)?|feb(?:ruary)?|mar(?:ch)?|apr(?:il)?|may|jun(?:e)?|jul(?:y)?|aug(?:ust)?|sep(?:tember)?|oct(?:ober)?|nov(?:ember)?|dec(?:ember)?)\s+\d{1,2},?\s*\d{4}\b|' # Month DD, YYYY
r'\b\d{4}\b|' # YYYY
r'\b(?:jan(?:uary)?|feb(?:ruary)?|mar(?:ch)?|apr(?:il)?|may|jun(?:e)?|jul(?:y)?|aug(?:ust)?|sep(?:tember)?|oct(?:ober)?|nov(?:ember)?|dec(?:ember)?)[a-z]*/?\d{4}\b|' # Month/YYYY
r'\b(?:jan(?:uary)?|feb(?:ruary)?|mar(?:ch)?|apr(?:il)?|may|jun(?:e)?|jul(?:y)?|aug(?:ust)?|sep(?:tember)?|oct(?:ober)?|nov(?:ember)?|dec(?:ember)?)[a-z]*\d{4}\s*-\s*(?:jan(?:uary)?|feb(?:ruary)?|mar(?:ch)?|apr(?:il)?|may|jun(?:e)?|jul(?:y)?|aug(?:ust)?|sep(?:tember)?|oct(?:ober)?|nov(?:ember)?|dec(?:ember)?)[a-z]*\d{4}\b' # Month/YYYY - Month/YYYY
)
all_dates = []
# Iterate over the entries in the section_name
for entry in sections_text[section_name]:
entry = entry.lower()
matches = re.findall(date_pattern, entry)
if matches and len(matches)>1:
if len(matches) == 2:
all_dates.append(f"{matches[0]} {matches[1]}")
else:
all_dates.extend(matches)
return all_dates
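# Convert a raw date string into (start_date, end_date) datetime.date objects,
# e.g. "jan 2020 mar 2022" -> (date(2020, 1, 1), date(2022, 3, 1)); a single
# date yields identical start and end values.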
def convert_to_date(self, date_str):
# Mapping of month names and abbreviations to their numeric equivalents
month_map = {
'jan': 1, 'january': 1, 'feb': 2, 'february': 2,
'mar': 3, 'march': 3, 'apr': 4, 'april': 4,
'may': 5, 'jun': 6, 'june': 6, 'jul': 7,
'july': 7, 'aug': 8, 'august': 8, 'sep': 9,
'september': 9, 'oct': 10, 'october': 10,
'nov': 11, 'november': 11, 'dec': 12, 'december': 12,
'01': 1, '02': 2, '03': 3, '04': 4,
'05': 5, '06': 6, '07': 7, '08': 8,
'09': 9, '10': 10, '11': 11, '12': 12
}
# Regex patterns to match different date formats
pattern_mm_yyyy = re.compile(r'(\d{1,2})/(\d{4})')
pattern_mm_yyyy_space = re.compile(r'(\d{1,2})\s(\d{4})')
pattern_month_yyyy = re.compile(r'([a-zA-Z]+)\s?(\d{4})')
pattern_yyyy = re.compile(r'(\d{4})')
def extract_date(date_str):
match_mm_yyyy = pattern_mm_yyyy.match(date_str)
match_mm_yyyy_space = pattern_mm_yyyy_space.match(date_str)
match_month_yyyy = pattern_month_yyyy.match(date_str)
match_yyyy = pattern_yyyy.match(date_str)
if match_mm_yyyy:
month = int(match_mm_yyyy.group(1))
year = int(match_mm_yyyy.group(2))
elif match_mm_yyyy_space:
month = int(match_mm_yyyy_space.group(1))
year = int(match_mm_yyyy_space.group(2))
elif match_month_yyyy:
month = month_map.get(match_month_yyyy.group(1).lower())
year = int(match_month_yyyy.group(2))
elif match_yyyy:
month = 1
year = int(match_yyyy.group(1))
else:
return []
return datetime.date(year, month, 1)
date_parts = re.findall(r'(\d{4}\s[a-zA-Z]+\s?|\d{4}[a-zA-Z]+|\d{4}\/\d{2}|\d{4}\s\d{2}|[a-zA-Z]+\s?\d{4}|\d{4}\s[a-zA-Z]+)', date_str)
if len(date_parts) == 1:
# Standalone year or single date
start_date = extract_date(date_parts[0])
if not start_date:
return []
end_date = start_date
elif len(date_parts) == 2:
# Date range
start_date = extract_date(date_parts[0])
end_date = extract_date(date_parts[1])
if not start_date or not end_date:
return []
else:
return []
return start_date, end_date
def date_time(self, date_parts):
converted_dates = []
for date_part in date_parts:
result = self.convert_to_date(date_part)
if result:
start_date, end_date = result
converted_dates.append((start_date, end_date))
return converted_dates
def check_chronological_order(self, converted_dates, section_name ):
suggestion = ""
sorted_dates = sorted(converted_dates, key=lambda x: (x[1], x[0]), reverse=True)
if converted_dates == sorted_dates:
suggestion = f"{section_name} section is in chronological order."
else:
suggestion = f"{section_name} section is not in chronological order."
return suggestion
def check_common_projects(self, projects_text):
found_projects = []
for project in common_projects:
if project.lower() in projects_text.lower():
found_projects.append(project)
return found_projects
def recommend_resources(self):
# Randomly pick 2 blog articles and 2 YouTube links
recommended_blogs = random.sample(blog_articles, 2)
recommended_youtube = random.sample(youtube_links, 2)
# Return the recommendations
return {
"Recommended Blogs": recommended_blogs,
"Recommended YouTube Links": recommended_youtube
}
def check_imarticus_certifications(self, certifications_text):
# Check if "imarticus" is present in the certifications text
if "imarticus" in certifications_text.lower():
return {
"found": True,
"message": "Imarticus certification found. Please upload it in the academic section."
}
return {
"found": False,
"message": "No Imarticus certification found in the provided text."
}
def chronological_order_check(self, sections_text, section_name):
order_suggestion = ""
suggestion = ""
section_name = section_name.upper()
if section_name in sections_text:
date = self.parse_dates(sections_text, section_name)
if date:
converted_dates = self.date_time(date)
order_suggestion = self.check_chronological_order(converted_dates, section_name)
else:
suggestion = f"No valid dates found in {section_name} section. "
else:
suggestion = f"{section_name} is not in section header. "
return order_suggestion, suggestion
# Check section headers against the known header list and collect possible misspellings
def check_spelling(self, headers, section_headers):
suggestions = []
for header in headers:
if header.upper() not in map(str.upper, section_headers):
suggestions.append(header)
return suggestions
def is_present_name(self, name):
"""Return True if the given name has at least two words."""
parts = name.split()
return len(parts) >= 2
def is_sentence_case(self, name):
"""Return True if every word starts with an uppercase letter followed by lowercase letters."""
parts = name.split()
for part in parts:
if not part:  # handles empty strings in name
continue
if not part[0].isupper() or not part[1:].islower():
return False
return True
def extract_project_links(self,sections_text):
project_links = {}
if "PROJECTS" in sections_text:
project_list = sections_text.get("PROJECTS", [])
url_pattern = r"https?://[^\s]+"
for project in project_list:
links = re.findall(url_pattern,project)
if links:
project_links[project] = links
return project_links
def count_sentences(self,text):
sentence_endings = r"(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|!)\s"
sentences = re.split(sentence_endings, text)
sentences = [s.strip() for s in sentences if s.strip()]
return len(sentences)
def calculate_summary_score(self, summary):
if not summary:
return 0
if isinstance(summary, list):
summary = " ".join(summary)  # sections are stored as lists of lines
num_sentences = self.count_sentences(summary)
if num_sentences <= 4:
return 3
return 1
def calculate_extra_urls_bonus(self,pdf_path):
domains = [
r"hackerrank\.com", # Hackerrank
r"leetcode\.com", # LeetCode
r"medium\.com" # Medium
]
extra_urls = self.extract_extra_urls_pdf(pdf_path, domains)
has_extra_urls = any(urls for urls in extra_urls.values())
return 5 if has_extra_urls else 0
def calculate_relevant_experience_score(self, experience_text):
"""
Assigns a score based on the presence of relevant experience keywords.
Args:
experience_text (str): The extracted work experience section text.
Returns:
int: A score of 5 if relevant keywords are found, otherwise 0.
"""
if not experience_text:
return 0  # No experience section, score 0
if isinstance(experience_text, list):
experience_text = " ".join(experience_text)  # Convert list to a single string
experience_text = experience_text.strip().lower()  # Ensure it's a string and lowercase
# Check if any keyword from 'data_science_skills' or 'essential_skills' exists
for skill in data_science_skills + essential_skills:
if skill.lower() in experience_text:
return 5  # Found relevant experience, full score
return 0
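# Skill-match rubric: no skills extracted -> 0, skills present but none from
# data_science_skills -> 2, 1-5 matches -> 3, more than 5 matches -> 5.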
def calculate_ds_skills_score(self, skills_present):
if not skills_present: # No skills found at all
return 0
# Use skills from config instead of hardcoded list
ds_skills_list_lower = [skill.lower() for skill in data_science_skills]
skills_present_lower = [skill.lower() for skill in skills_present]
matching_count = sum(1 for skill in skills_present_lower
if skill in ds_skills_list_lower)
if matching_count == 0: # Skills found but none match DS list
return 2
elif 1 <= matching_count <= 5:
return 3
elif matching_count > 5:
return 5
return 0
def calculate_project_link_score(self, projects_with_links):
"""
Assigns a score based on whether project links are present.
Args:
projects_with_links (int): The number of projects with links.
Returns:
int: 2 if project links are found, otherwise 0.
"""
return 2 if projects_with_links > 0 else 0
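# Aggregate resume score: up to 3 points each for name, contact number, email,
# LinkedIn and GitHub URLs, up to 10 for section completeness, 5 when no
# copy-paste "common" projects are found, a 2-point penalty when a section-order
# suggestion is raised, up to 5 for project count, up to 7 for certifications,
# plus the data-science skill, relevant-experience and project-link scores
# computed by the helpers above.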
def imarticus_review_score(self,name,contact_number,email,linkedin_urls,github_url,missing_sections,sections_not_capitalized,common_projects,section_order_suggestion,sections_text,skills,relevant_experience_score):
score = 0
if name:
name_parts = name.split()
num_parts = len(name_parts)
if num_parts == 0:
score += 0
if self.is_sentence_case(name):
score += 3
elif self.is_present_name(name):
score += 1.5
if contact_number and isinstance(contact_number, str):
digits_only = re.sub(r'\D', '', contact_number)
if digits_only.startswith("91") and len(digits_only) > 10:
digits_only = digits_only[2:] # Remove the first two characters ('91')
if len(digits_only) == 10 and digits_only[0] in "6789": # Check for valid Indian mobile numbers
score += 3
if email:
score += 3 if self.is_valid_email(email) else 0
score += 3 if linkedin_urls else 0
if github_url:
github_suggestion = self.is_valid_url(github_url)
score += 3 if not github_suggestion else 0
else:
score += 0
if len(missing_sections)==0 and len(sections_not_capitalized)==0:
score+=10
elif len(missing_sections)==0 and len(sections_not_capitalized)>0:
score+=8
elif len(missing_sections)<=3:
score+=6
else:
score+=3
if common_projects:
score +=0
else:
score +=5
if section_order_suggestion:
score -= 2
"""
ds_skills_list_lower = [skill.lower() for skill in data_science_skills]
skills_present_lower = [skill.lower() for skill in self.extract_skills_from_resume(skills) ]
matching_skill_count = 0
for skill in skills_present_lower:
if ds_skills_list_lower:
matching_skill_count+=1
if matching_skill_count==0:
score+=0
if matching_skill_count<=5:
score+=2
elif matching_skill_count>=10 and matching_skill_count<=15:
score+5
else:
score+=8
"""
if "PROJECTS" not in sections_text:
score+=0
else:
project_list = sections_text.get("PROJECTS",[])
project_count = len([x for x in project_list if "Description" in x])
if project_count<=2:
score+=2
elif project_count>2 and project_count<=4:
score+=5
elif project_count>4:
score+=3
"""
project_links = self.extract_project_links(sections_text)
total_projects = len(sections_text.get("PROJECTS", []))
projects_with_links = len(project_links)
if total_projects > 0:
if projects_with_links == 0:
score+=0
elif projects_with_links / total_projects >= 0.5:
score += 1.5
if projects_with_links == total_projects:
score += 3
"""
resume_data = {}
# Extract projects & links
project_links = self.extract_project_links(sections_text)
projects_with_links = len(project_links)
# Count only projects with descriptions
valid_projects = [
p for p in sections_text.get("PROJECTS", []) if "description" in p.lower()
]
total_projects = len(valid_projects)  # Count projects properly
# Calculate project link score
project_link_score = self.calculate_project_link_score(projects_with_links)
resume_data["project_link_score"] = project_link_score
# Prevent division by zero when no projects exist
if total_projects > 0:
if projects_with_links == 0:
score += 0
elif projects_with_links / total_projects >= 0.5:
score += 1.5
if projects_with_links == total_projects:
score += 3
else:
score += 0
"""
profile_summary = sections_text.get("PROFILE SUMMARY", "")
print(profile_summary)
summary_score = self.calculate_summary_score(profile_summary)
score += summary_score
"""
ds_skills_score = self.calculate_ds_skills_score(skills)
score += ds_skills_score
certifications = sections_text.get("CERTIFICATIONS & ACADEMIC ENDEAVOURS", [])
num_certifications = len(certifications)
if num_certifications==0:
score+=0
elif 0 < num_certifications <= 2:
score+=3
elif 2 < num_certifications <= 4:
score+=5
elif num_certifications>4:
score+=7
"""
extra_urls_bonus = self.calculate_extra_urls_bonus(pdf_path)
score += extra_urls_bonus
"""
score += relevant_experience_score
score += project_link_score
return score
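# Same rubric as imarticus_review_score, but returned as a per-criterion
# breakdown dictionary instead of a single total.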
def imarticus_detailed_score(self, name, contact_number, email, linkedin_urls, github_url,
missing_sections=None, sections_not_capitalized=None, common_projects=None,
section_order_suggestion=None, sections_text=None, skills=None,
relevant_experience_score=0):
# Ensure lists and dictionaries have default values to avoid 'NoneType' errors
missing_sections = missing_sections or []
sections_not_capitalized = sections_not_capitalized or []
common_projects = common_projects or []
sections_text = sections_text or {}
score_breakdown = {
"name_score": 0,
"contact_number_score": 0,
"email_score": 0,
"linkedin_url_score": 0,
"github_url_score": 0,
"missing_sections_score": 0,
"common_projects_score": 0,
"section_order_score": 0,
"projects_score": 0,
"certifications_score": 0,
"relevant_experience_score": 0,
"ds_skills_score": 0,
"extra_urls_bonus": 0,
"summary_score": 0,
"project_link_score": 0
}
# Name Score (3 Points)
if name:
if self.is_sentence_case(name):
score_breakdown["name_score"] = 3
elif self.is_present_name(name):
score_breakdown["name_score"] = 1.5
# Contact Number Score (3 Points)
if contact_number and isinstance(contact_number, str):
digits_only = re.sub(r'\D', '', contact_number)
if digits_only.startswith("91") and len(digits_only) > 10:
digits_only = digits_only[2:]
if len(digits_only) == 10 and digits_only[0] in "6789":
score_breakdown["contact_number_score"] = 3
# Email Score (3 Points)
score_breakdown["email_score"] = 3 if email and self.is_valid_email(email) else 0
# LinkedIn URL Score (3 Points)
score_breakdown["linkedin_url_score"] = 3 if linkedin_urls else 0
# GitHub URL Score (3 Points): is_valid_url() returns a non-empty suggestion only when the URL is invalid
if github_url and not self.is_valid_url(github_url):
score_breakdown["github_url_score"] = 3
# Missing Sections Score (10 Points)
if not missing_sections and not sections_not_capitalized:
score_breakdown["missing_sections_score"] = 10
elif not missing_sections and sections_not_capitalized:
score_breakdown["missing_sections_score"] = 8
elif len(missing_sections) <= 3:
score_breakdown["missing_sections_score"] = 6
else:
score_breakdown["missing_sections_score"] = 3
# Common Projects Score (5 Points)
score_breakdown["common_projects_score"] = 0 if common_projects else 5
# Section Order Score (2 Points)
score_breakdown["section_order_score"] = -2 if section_order_suggestion else 0
# Projects Score (5 Points)
if "PROJECTS" in sections_text:
project_list = sections_text.get("PROJECTS", [])
project_count = len([x for x in project_list if "Description" in x])
if project_count <= 2:
score_breakdown["projects_score"] = 2
elif 2 < project_count <= 4:
score_breakdown["projects_score"] = 5
else:
score_breakdown["projects_score"] = 3
# Certifications Score (7 Points)
certifications = sections_text.get("CERTIFICATIONS & ACADEMIC ENDEAVOURS", [])
num_certifications = len(certifications)
if num_certifications == 0:
score_breakdown["certifications_score"] = 0
elif 0 < num_certifications <= 2:
score_breakdown["certifications_score"] = 3
elif 2 < num_certifications <= 4:
score_breakdown["certifications_score"] = 5
else:
score_breakdown["certifications_score"] = 7
# Relevant Experience Score (5 Points)
score_breakdown["relevant_experience_score"] = relevant_experience_score if relevant_experience_score is not None else 0
# Data Science Skills Score (5 Points)
score_breakdown["ds_skills_score"] = self.calculate_ds_skills_score(skills)
# Extra URLs Bonus (5 Points)
# NOTE: calculate_extra_urls_bonus() expects a PDF path; passing sections_text here yields no bonus
score_breakdown["extra_urls_bonus"] = self.calculate_extra_urls_bonus(sections_text)
# Summary Score (5 Points)
profile_summary = sections_text.get("PROFILE SUMMARY", "")
score_breakdown["summary_score"] = self.calculate_summary_score(profile_summary)
# Project Link Score (2 Points)
project_links = self.extract_project_links(sections_text)
projects_with_links = len(project_links)
score_breakdown["project_link_score"] = self.calculate_project_link_score(projects_with_links)
return score_breakdown
def calculate_name_score(self,name):
if not name:
return 0
name_parts = name.split()
num_parts = len(name_parts)
if num_parts == 0:
return 0
elif self.is_sentence_case(name):
return 3
elif self.is_present_name(name):
return 1.5
else:
return 0
def calculate_contact(self, contact_number):
if contact_number and isinstance(contact_number, str):
digits_only = re.sub(r'\D', '', contact_number)
if digits_only.startswith("91") and len(digits_only) > 10:
digits_only = digits_only[2:]  # Remove the leading country code ('91')
if len(digits_only) == 10 and digits_only[0] in "6789":  # Check for valid Indian mobile numbers
return 3
return 0
def calculate_email(self, email):
if email and self.is_valid_email(email):
return 3
return 0
def calculate_github_url_score(self,github_url):
if github_url:
github_suggestion = self.is_valid_url(github_url)
return 3 if not github_suggestion else 0
return 0
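# Main entry point: runs every extraction and scoring step on the PDF at 'path'
# and returns the aggregated results as a Flask JSON response.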
def parse_text(self, path):
logger = logging.getLogger(__name__)
logging.getLogger("pdfminer").setLevel(logging.WARNING)
resume_data = {}
logger.debug('parsing text')
text = self.extract_text_from_pdf(path)
text1 = " ".join(text.split("\n"))
skills_found = self.extract_skills_from_resume(text)
found_keywords = self.extract_keyword_variations_from_resume(text)
sections_text = self.segregate_sections(text)
formatted_text = self.extract_and_format_sections(sections_text, Extract_sections)
found_keyword_section = self.extract_keyword_variations_from_formatted_text(formatted_text)
parsed_sections = self.segregate_sections(text)
projects = parsed_sections.get("PROJECTS", [])
certifications = parsed_sections.get("CERTIFICATIONS & ACADEMIC ENDEAVOURS", [])
projects_text = "\n".join(projects)
certifications_text = "\n".join(certifications)
found_imarticus_certification = self.check_imarticus_certifications(certifications_text)
found_projects = self.check_common_projects(projects_text)
name, name_suggestion = self.extract_name(text)
contact_number, contact_suggestion = self.extract_contact_number_from_resume(text)
email, email_suggestion = self.extract_email_from_resume(path)
github_urls = self.extract_github_urls_from_pdf(path)
github_urls_suggestions = self.is_valid_url(github_urls)
linkedin_urls = self.extract_linkedIn_urls_from_pdf(path)
section_by_grammer_issues = self.grammar_issue_check(text, found_keyword_section, Extract_sections)
domains = [
r"hackerrank\.com", # Hackerrank
r"leetcode\.com", # LeetCode
r"medium\.com" # Medium
]
extra_urls = self.extract_extra_urls_pdf(path, domains)
education_order_suggestion, education_suggestion = self.chronological_order_check(sections_text, "ACADEMIC PROFILE")
experience_order_suggestion, experience_suggestion = self.chronological_order_check(sections_text, "WORK EXPERIENCE")
headers = list(sections_text.keys())
spelling_suggestions = self.check_spelling(headers, section_headers)
predefined_terms = [term for term in (name, email) if term]  # skip missing values to avoid matching against None
predefined_terms.extend(required_sections)
text_properties = self.extract_text_properties(path, predefined_terms)
grouped_properties = self.group_similar_fonts(text_properties)
different_texts = self.identify_different_fonts_and_sizes(grouped_properties)
font_suggestions = []
for item in different_texts:
font_suggestion = f"Formatting issue at Page: {item['page_num']}, Text: {item['text']}, Reason: {item['reason']}, Found font size: {item['found_size']}, Found font name: {item['found_font_name']}"
font_suggestions.append(font_suggestion)
missing_sections, sections_not_capitalized = self.extract_sections_from_resume(text)
linkedin_urls_suggestion = str()
common_project = str()
if not name:
name_suggestion = "Please add name to the resume."
if not contact_number:
contact_suggestion = "Please add the contact number to the resume."
if not email:
email_suggestion = "Please add the email address to the resume."
if not github_urls:
github_urls_suggestions = "Add the github_urls to the resume."
if not linkedin_urls:
linkedin_urls_suggestion = "Add the linkedin_urls to the resume."
if found_projects:
common_project = "Common projects found in Projects section: "
for project in found_projects:
common_project += project
# Replace the existing project length suggestion code with:
project_list = sections_text.get("PROJECTS", [])
projects_with_description = [
p for p in project_list
if "description" in p.lower()
]
project_count = len(projects_with_description)
if project_count == 0:
project_length_suggestion = "No projects found. Consider at least 2 projects."
elif project_count == 1:
project_length_suggestion = "Only 1 project found. Consider adding 1 more project."
else:
project_length_suggestion = f"{project_count} projects found."
# Store in resume data (keeps your existing URL extraction)
resume_data["project_length_suggestion"] = project_length_suggestion
experience_text = sections_text.get("WORK EXPERIENCE", "")  # Extract the work experience section
relevant_experience_score = self.calculate_relevant_experience_score(experience_text)  # Calculate score
# Store in the final resume data output
resume_data["relevant_experience_score"] = relevant_experience_score
section_grammar_check_issues = self.grammar_check(" ".join(sections_text.keys()))  # LanguageTool expects a single string, not dict_keys
recommended_blogs = random.sample(blog_articles, 2)
recommended_youtube = random.sample(youtube_links, 2)
name_score = self.calculate_name_score(name)
contact_score = self.calculate_contact(contact_number)
email_score = self.calculate_email(email)
github_url_score = self.calculate_github_url_score(github_urls)
# Calculate imarticus_score
imarticus_score = self.imarticus_review_score(
name,
contact_number,
email,
linkedin_urls,
github_urls,
missing_sections,
sections_not_capitalized,
common_projects=found_projects, # Ensure to pass found projects
section_order_suggestion=experience_order_suggestion,
sections_text=sections_text,
skills=skills_found,
relevant_experience_score=relevant_experience_score,
#pdf_path=path
#relevant_keywords_found=bool(found_keywords), # Convert to boolean
#experience_orderly_arranged=experience_order_suggestion, # Pass orderly arrangement check
#experience_section_present="WORK EXPERIENCE" in sections_text # Check if experience section is present
)
# Populate resume data dictionary
resume_data = {
"name": name,
"contact_number": contact_number,
"email": email,
"linkedin_urls": linkedin_urls,
"experience_order_suggestion": experience_order_suggestion,
"education_order_suggestion": education_order_suggestion,
"grammer_issues_by_section": section_by_grammer_issues,
"github_urls": github_urls,
"skills": skills_found,
"spelling_suggestions": spelling_suggestions,
"found_keywords": found_keywords,
"text": text,
"font_suggestions": font_suggestions,
"name_suggestion": name_suggestion,
"contact_suggestion": contact_suggestion,
"email_suggestion": email_suggestion,
"github_urls_suggestions": github_urls_suggestions,
"linkedin_urls_suggestion": "Add the LinkedIn URLs to the resume." if not linkedin_urls else "",
"missing_sections": missing_sections,
"common_projects": "Common projects found in Projects section: " + ", ".join(found_projects) if found_projects else "",
"project_length_suggestion": project_length_suggestion,
"section_grammar_check_issues": section_grammar_check_issues,
"imarticus_score": imarticus_score, # Add the score to resume data
"extra_urls": extra_urls,
"certifications": {
"found": found_imarticus_certification["found"],
"message": found_imarticus_certification["message"],
"text": certifications_text # Store extracted certification text
},
"recommended_blogs": recommended_blogs,
"recommended_youtube_links": recommended_youtube,
"name_score":name_score,
"contact_score":contact_score,
"email_score":email_score,
"github_urls_score":github_url_score
}
# Additional checks and data additions
if "WORK EXPERIENCE" in sections_text.keys() and "WORK EXPERIENCE" != list(sections_text.keys())[2]:
section_order_suggestion = f"WORK EXPERIENCE should come before {list(sections_text.keys())[2]}"
resume_data["section_order_suggestion"] = section_order_suggestion
missing_important_sections = self.check_missing_sections(resume_data)
resume_data["basic_information_section"] = missing_important_sections or "Basic information is Found"
missing_skills = list(set(essential_skills) - set(skills_found))
resume_data["missing_skills"] = missing_skills
found_keywords_count = len(resume_data["found_keywords"])
num_keywords = len(keyword_variations)
quality_mapping = {"Low": 0.2, "Medium": 0.5, "High": 0.8} # Assuming some quality mapping
for quality, threshold in quality_mapping.items():
if found_keywords_count < num_keywords * threshold:
resume_data["quality"] = quality
break
found_certification = "Imarticus certification found in Certifications section." if found_imarticus_certification else "No Imarticus certification found in Certifications section."
resume_data["found_certification"] = found_certification
# Experience relevance check
Extract_exp_sections = ['WORK EXPERIENCE']
experience_text = self.extract_and_format_sections(sections_text, Extract_exp_sections)
if experience_text:
resume_data["work_experience_check"] = "Experience is relevant to Data science." if any(variation.lower() in experience_text.lower() for keyword, variations in keyword_variations.items() for variation in variations) else "Experience is not relevant to Data science."
return jsonify(resume_data)
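# Minimal usage sketch (assumes a Flask application/request context, since
# parse_text() returns a jsonify() response; "sample_resume.pdf" is a
# hypothetical path):
#
#     from flask import Flask
#     app = Flask(__name__)
#     parser = ResumeParser()
#     with app.test_request_context():
#         response = parser.parse_text("sample_resume.pdf")
#         print(response.get_json()["imarticus_score"])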