Spaces:
Sleeping
Sleeping
| #total score = 67 | |
| # python file to parse different section from resume | |
| from pdfminer.high_level import extract_pages, extract_text | |
| from pdfminer.layout import LTTextContainer, LTChar, LTTextLineHorizontal | |
| from collections import defaultdict | |
| from flask import jsonify | |
| import re, fitz, requests, logging, datetime | |
| import src.config as config | |
| from .config import data_science_skills, keyword_variations, essential_skills, quality_mapping, Extract_sections, suggested_projects, ignore_rule_ids | |
| from .config import required_sections, linkedin_domain, github_domain, basic_informations, section_headers, common_projects, ignore_error_keywords,blog_articles,youtube_links | |
| from .config import kaggle_domain,hackerrank_domain,leetcode_domain,medium_domain | |
| from spacy.matcher import Matcher | |
| import language_tool_python | |
| from collections import defaultdict | |
| import random | |
| tool = language_tool_python.LanguageTool('en-US') | |
| class ResumeParser: | |
| def extract_contact_number_from_resume(self, text): | |
| contact_number = None | |
| suggestion = "" | |
| # Use regex pattern to find a potential contact number | |
| pattern = r"\b(?:\+?\d{1,3}[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}\b" | |
| match = re.search(pattern, text) | |
| if match: | |
| contact_number = match.group() | |
| # Check if the contact number is of the correct length | |
| digits_only = re.sub(r'\D', '', contact_number) | |
| if len(digits_only) == 10: | |
| suggestion = "" | |
| elif len(digits_only) > 10 and digits_only.startswith('91') and len(digits_only[2:]) == 10: | |
| suggestion = "" | |
| else: | |
| suggestion = "Contact number should have exactly 10 digits." | |
| return contact_number, suggestion | |
| def extract_hyperlinks(self, pdf_path): | |
| doc = fitz.open(pdf_path) | |
| links = [] | |
| for page_num in range(len(doc)): | |
| page = doc.load_page(page_num) | |
| link_list = page.get_links() | |
| for link in link_list: | |
| uri = link.get('uri', None) | |
| if uri: | |
| links.append(uri) | |
| return links | |
| def extract_text_from_pdf(self, pdf_path): | |
| return extract_text(pdf_path) | |
| def extract_email_from_text(self, text): | |
| pattern = r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b" | |
| match = re.search(pattern, text) | |
| if match: | |
| return match.group() | |
| return None | |
| def extract_email_from_resume(self, pdf_path): | |
| text = self.extract_text_from_pdf(pdf_path) | |
| email = self.extract_email_from_text(text) | |
| suggestion = "" | |
| # If no email found in text, check hyperlinks | |
| if not email: | |
| links = self.extract_hyperlinks(pdf_path) | |
| for link in links: | |
| if link.startswith('mailto:'): | |
| email_candidate = link.split('mailto:')[1] | |
| if self.is_valid_email(email_candidate): | |
| email = email_candidate | |
| break | |
| # Additional validation for email found in text or links | |
| if email and not self.is_valid_email(email): | |
| suggestion += "Your email address doesn't seem to be valid. Please check and correct." | |
| return email, suggestion | |
| def is_valid_email(self, email): | |
| # Length check | |
| if len(email) > 254: | |
| return False | |
| # Consecutive special characters check | |
| if re.search(r"[._%+-]{2,}", email): | |
| return False | |
| # Domain part validation | |
| domain_part = email.split('@')[1] | |
| if not re.match(r"[A-Za-z0-9.-]+\.[A-Za-z]{2,}", domain_part): | |
| return False | |
| # Standard email format check | |
| pattern = r"^[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$" | |
| return re.match(pattern, email) is not None | |
| def extract_sections_from_resume(self, text): | |
| missing_sections = [] | |
| sections_not_capitalized = [] | |
| for section in required_sections: | |
| pattern = r"\b{}\b".format(re.escape(section)) | |
| match_obj = re.search(pattern, text, re.IGNORECASE) | |
| if not match_obj: | |
| missing_sections.append(section) | |
| else: | |
| if match_obj.group() not in map(str.upper, required_sections): | |
| sections_not_capitalized.append(section) | |
| return missing_sections, sections_not_capitalized | |
| def extract_skills_from_resume(self, text): | |
| if not isinstance(text, str): | |
| raise ValueError(f"Expected 'text' to be a string, but got {type(text)}") | |
| skills = [] | |
| for skill in essential_skills: | |
| pattern = r"\b{}\b".format(re.escape(skill)) | |
| match = re.search(pattern, text, re.IGNORECASE) | |
| if match: | |
| skills.append(skill) | |
| return skills | |
| def extract_keyword_variations_from_resume(self, text): | |
| found_keywords = [] | |
| for keyword, variations in keyword_variations.items(): | |
| for variation in variations: | |
| if variation.lower() in text.lower(): | |
| found_keywords.append(variation) | |
| break | |
| return found_keywords | |
| def extract_keyword_variations_from_formatted_text(self, formatted_text): | |
| found_keyword_section = [] | |
| for keyword, variations in keyword_variations.items(): | |
| for variation in variations: | |
| if variation.lower() in formatted_text.lower(): | |
| found_keyword_section.append(variation) | |
| break | |
| return found_keyword_section | |
| def extract_linkedIn_urls_from_pdf(self, pdf_path): | |
| linkedin_urls = None | |
| pdf_document = fitz.open(pdf_path) | |
| for page_num in range(len(pdf_document)): | |
| page = pdf_document.load_page(page_num) | |
| links = page.get_links() | |
| for link in links: | |
| url = link.get('uri', '') | |
| if re.search(linkedin_domain, url): | |
| linkedin_urls = url | |
| pdf_document.close() | |
| return linkedin_urls | |
| def extract_github_urls_from_pdf(self, pdf_path): | |
| github_urls = None | |
| pdf_document = fitz.open(pdf_path) | |
| for page_num in range(len(pdf_document)): | |
| page = pdf_document.load_page(page_num) | |
| links = page.get_links() | |
| for link in links: | |
| url = link.get('uri', '') | |
| if re.search(github_domain, url): | |
| path = re.sub(github_domain, '', url) | |
| parts = path.split('/') | |
| if len(parts) == 1: | |
| github_urls = url | |
| pdf_document.close() | |
| return github_urls | |
| def extract_extra_urls_pdf(self,pdf_path, domains): | |
| extracted_urls = defaultdict(set) | |
| try: | |
| # Open the PDF document | |
| pdf_document = fitz.open(pdf_path) | |
| # Iterate through all pages in the PDF | |
| for page_num in range(len(pdf_document)): | |
| page = pdf_document.load_page(page_num) | |
| links = page.get_links() | |
| for link in links: | |
| url = link.get('uri', '') | |
| if url: # Ensure there's a URL | |
| for domain in domains: | |
| if re.search(domain, url, re.IGNORECASE): | |
| extracted_urls[domain].add(url) # Add URL to the domain's set | |
| except Exception as e: | |
| print(f"Error processing PDF: {e}") | |
| finally: | |
| pdf_document.close() | |
| return {domain: list(urls) for domain, urls in extracted_urls.items()} | |
| def is_valid_url(self , github_urls ): | |
| suggest = "" | |
| for _ in [github_urls]: | |
| if not github_urls: | |
| break | |
| try: | |
| response = requests.head(github_urls) | |
| if response.status_code != 200: | |
| suggest = "GitHub URL is not valid, please check and correct. " | |
| except requests.RequestException: | |
| suggest = "GitHub URL is not valid, please check and correct. " | |
| return suggest | |
| return suggest | |
| def is_valid_name(self, name): | |
| if any(char.isdigit() for char in name): | |
| return False | |
| if len(name.split()) > 3: | |
| return False | |
| common_non_names = {"Email", "Github", "LinkedIn", "Portfolio", "Data Analyst"} | |
| if name in common_non_names: | |
| return False | |
| return True | |
| def extract_name(self, resume_text): | |
| lines = resume_text.split('\n') | |
| # Use regex to find lines that likely contain names | |
| name_lines = [line for line in lines if re.match(r'^[A-Za-z]*\s[A-Za-z]*', line.strip())] | |
| names = [] | |
| for i in range(len(name_lines)): | |
| if self.is_valid_name(name_lines[i].strip()): | |
| names.append(name_lines[i].strip()) | |
| if len(names) >= 1: | |
| name = names[0] | |
| suggestion = "" | |
| # Check if the name parts contain only alphabetic characters | |
| name_parts = name.split() | |
| if any(part[0].islower() for part in name_parts): | |
| suggestion += " name should start with a capital letter. " | |
| return name, suggestion | |
| return None, "No valid name found" | |
| def check_missing_sections(self, resume_data): | |
| missing_information = [] | |
| for section in basic_informations: | |
| if not resume_data.get(section): | |
| missing_information.append(section) | |
| return missing_information | |
| def segregate_sections(self, text): | |
| header_pattern = re.compile(rf'^\s*({"|".join(re.escape(header) for header in section_headers)}):?\s*$', re.IGNORECASE) | |
| sections_text = {} | |
| current_section = None | |
| lines = text.splitlines() | |
| for line in lines: | |
| clean_line = line.strip() | |
| match = header_pattern.match(clean_line) | |
| if match: | |
| current_section = match.group(1).upper() | |
| sections_text[current_section] = [] | |
| elif current_section: | |
| sections_text[current_section].append(line.strip()) | |
| return sections_text | |
| def extract_and_format_sections(self, sections_text, Extract_sections): | |
| formatted_text = "" | |
| for section in Extract_sections: | |
| if section in sections_text: | |
| section_content = " ".join(sections_text[section]).replace('\n', ' ') | |
| formatted_text += f"{section}:\n{section_content}\n\n" | |
| return formatted_text | |
| def replace_keywords_with_placeholders(self, formatted_text, found_keyword_section): | |
| placeholder_text = formatted_text | |
| keyword_placeholders = {} | |
| # Use a set to avoid duplicates and keep track of keyword placeholders | |
| used_keywords = set() | |
| for i, keyword in enumerate(found_keyword_section): | |
| if keyword not in used_keywords: | |
| used_keywords.add(keyword) | |
| placeholder = f"{{KEYWORD_{i}}}" | |
| keyword_placeholders[placeholder] = keyword | |
| # Using word boundary to match whole words | |
| placeholder_text = re.sub(r'\b' + re.escape(keyword) + r'\b', placeholder, placeholder_text, flags=re.IGNORECASE) | |
| return placeholder_text, keyword_placeholders | |
| def replace_placeholders_with_keywords(self, grammar_issues, keyword_placeholders): | |
| updated_issues = [] | |
| for issue in grammar_issues: | |
| context = issue['context'] | |
| for placeholder, keyword in keyword_placeholders.items(): | |
| context = context.replace(placeholder, keyword) | |
| # Update the context in the issue dictionary | |
| issue['context'] = context | |
| updated_issues.append(issue) | |
| return updated_issues | |
| def grammar_check(self, placeholder_text): | |
| matches = tool.check(placeholder_text) | |
| grammar_issues = [] | |
| for match in matches: | |
| issue = { | |
| "context": match.context, | |
| "error": match.message, | |
| "rule_id": match.ruleId, | |
| "suggested_correction": match.replacements | |
| } | |
| grammar_issues.append(issue) | |
| return grammar_issues | |
| def filter_grammar_issues(self, grammar_issues, ignore_rule_ids=None, ignore_error_keywords=None): | |
| if ignore_rule_ids is None: | |
| ignore_rule_ids = [] | |
| if ignore_error_keywords is None: | |
| ignore_error_keywords = [] | |
| filtered_issues = [] | |
| for issue in grammar_issues: | |
| if issue['rule_id'] not in ignore_rule_ids and not any(keyword in issue['error'] for keyword in ignore_error_keywords): | |
| filtered_issues.append(issue) | |
| return filtered_issues | |
    def process_resume(self, text, found_keyword_section, Extract_sections):
        """Run the full grammar pipeline over the requested resume sections.

        Returns {"grammar_issues": [...], "spelling_errors": [...]} where
        spelling errors are the subset whose rule id contains "SPELLING".
        NOTE(review): the found_keyword_section parameter is immediately
        overwritten below, so callers' values are ignored.
        """
        sections_text = self.segregate_sections(text)
        formatted_text = self.extract_and_format_sections(sections_text, Extract_sections)
        found_keyword_section = self.extract_keyword_variations_from_formatted_text(formatted_text)
        placeholder_text, keyword_placeholders = self.replace_keywords_with_placeholders(formatted_text, found_keyword_section)
        grammar_issues = self.grammar_check(placeholder_text)
        # replace_placeholders_with_keywords mutates the issue dicts in place,
        # so grammar_issues already carries the restored contexts; the filter
        # below therefore sees the un-masked text even though it is passed
        # grammar_issues rather than grammar_issues_text.
        grammar_issues_text = self.replace_placeholders_with_keywords(grammar_issues, keyword_placeholders)
        filtered_grammar_issues = self.filter_grammar_issues(grammar_issues, ignore_rule_ids, ignore_error_keywords)
        return {
            "grammar_issues": filtered_grammar_issues,
            "spelling_errors": [issue for issue in filtered_grammar_issues if "SPELLING" in issue['rule_id']]
        }
| def grammar_issue_check(self, text, found_keyword_section, Extract_sections): | |
| issues = {} | |
| text1 = " ".join(text.split("\n")) | |
| for section in Extract_sections: | |
| grammar_issues = self.process_resume(text, found_keyword_section, [section]) | |
| if not grammar_issues: | |
| grammar_issues = "no error found" | |
| issues[section] = grammar_issues | |
| return issues | |
| def normalize_font_name(self,font_name): | |
| if '-' in font_name: | |
| font_name = font_name.split('-')[0] | |
| if '+' in font_name: | |
| font_name = font_name.split('+')[1] | |
| return font_name | |
    def extract_text_properties(self, pdf_path, predefined_terms):
        """Scan a PDF character by character and collect styled text runs.

        Consecutive characters sharing the same font size, font name, and page
        are merged into one phrase; whitespace and punctuation/bullet characters
        terminate the current phrase. Phrases that occur as a substring of any
        entry in *predefined_terms* are skipped.

        Args:
            pdf_path: Path of the PDF to analyze (read via pdfminer).
            predefined_terms: Iterable of strings whose phrases are excluded.

        Returns:
            list[dict]: one {"text", "font_size", "font_name", "page_num"}
            entry per uninterrupted run of same-styled text.
        """
        text_properties = []
        current_phrase = ""
        current_font_size = None
        current_font_name = None
        current_page_num = None
        # Characters that end a phrase; the leading symbols appear to be
        # mojibake-encoded bullet glyphs — TODO confirm intended characters.
        special_characters = set("ββͺβ’!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~")
        def add_current_phrase():
            # Flush the accumulated phrase (unless blank or contained in a
            # predefined term), then reset the accumulator.
            nonlocal current_phrase
            if current_phrase.strip():
                flag = any(current_phrase in term for term in predefined_terms)
                if not flag:
                    text_properties.append({
                        "text": current_phrase,
                        "font_size": current_font_size,
                        "font_name": current_font_name,
                        "page_num": current_page_num
                    })
            current_phrase = ""
        for page_layout in extract_pages(pdf_path):
            for element in page_layout:
                if isinstance(element, LTTextContainer):
                    for text_line in element:
                        if isinstance(text_line, LTTextLineHorizontal):
                            for character in text_line:
                                if isinstance(character, LTChar):
                                    text = character.get_text()
                                    font_size = round(character.size, 2)
                                    font_name = self.normalize_font_name(character.fontname)
                                    page_num = page_layout.pageid
                                    # Whitespace / punctuation ends the phrase.
                                    if text.isspace() or text in special_characters:
                                        add_current_phrase()
                                        continue
                                    # A style or page change also ends it.
                                    if (font_size != current_font_size or font_name != current_font_name or
                                            page_num != current_page_num):
                                        add_current_phrase()
                                        current_font_size = font_size
                                        current_font_name = font_name
                                        current_page_num = page_num
                                    current_phrase += text
        # Flush whatever remains after the last character.
        add_current_phrase()
        return text_properties
| def group_similar_fonts(self,text_properties, tolerance=0.5): | |
| grouped_properties = defaultdict(list) | |
| for prop in text_properties: | |
| rounded_size = round(prop["font_size"] / tolerance) * tolerance | |
| key = (prop["font_name"], rounded_size) | |
| grouped_properties[key].append(prop) | |
| return grouped_properties | |
| def identify_different_fonts_and_sizes(self, grouped_properties): | |
| most_common_group = max(grouped_properties.values(), key=len) | |
| most_common_key = None | |
| for key, group in grouped_properties.items(): | |
| if group == most_common_group: | |
| most_common_key = key | |
| break | |
| different_texts = [] | |
| for key, group in grouped_properties.items(): | |
| if group != most_common_group: | |
| for prop in group: | |
| reason = [] | |
| if key[1] != most_common_key[1]: | |
| reason.append(f"size not {most_common_key[1]}") | |
| if key[0] != most_common_key[0]: | |
| reason.append(f"font not {most_common_key[0]}") | |
| different_texts.append({ | |
| "page_num": prop['page_num'], | |
| "text": prop['text'], | |
| "found_size": prop['font_size'], | |
| "found_font_name": prop['font_name'], | |
| "reason": ", ".join(reason) | |
| }) | |
| return different_texts | |
| def parse_dates(self, sections_text, section_name): | |
| # Check if the section is in the text | |
| suggest = "" | |
| # Define the date patterns to match various date formats | |
| date_pattern = ( | |
| r'\b\d{1,2}/\d{4}\b|' # MM/YYYY | |
| r'\b(?:jan(?:uary)?|feb(?:ruary)?|mar(?:ch)?|apr(?:il)?|may|jun(?:e)?|jul(?:y)?|aug(?:ust)?|sep(?:tember)?|oct(?:ober)?|nov(?:ember)?|dec(?:ember)?)\s+\d{4}\b|' # Month YYYY | |
| r'\b(?:jan(?:uary)?|feb(?:ruary)?|mar(?:ch)?|apr(?:il)?|may|jun(?:e)?|jul(?:y)?|aug(?:ust)?|sep(?:tember)?|oct(?:ober)?|nov(?:ember)?|dec(?:ember)?)\s+\d{1,2},?\s*\d{4}\b|' # Month DD, YYYY | |
| r'\b\d{4}\b|' # YYYY | |
| r'\b(?:jan(?:uary)?|feb(?:ruary)?|mar(?:ch)?|apr(?:il)?|may|jun(?:e)?|jul(?:y)?|aug(?:ust)?|sep(?:tember)?|oct(?:ober)?|nov(?:ember)?|dec(?:ember)?)[a-z]*/?\d{4}\b|' # Month/YYYY | |
| r'\b(?:jan(?:uary)?|feb(?:ruary)?|mar(?:ch)?|apr(?:il)?|may|jun(?:e)?|jul(?:y)?|aug(?:ust)?|sep(?:tember)?|oct(?:ober)?|nov(?:ember)?|dec(?:ember)?)[a-z]*\d{4}\s*-\s*(?:jan(?:uary)?|feb(?:ruary)?|mar(?:ch)?|apr(?:il)?|may|jun(?:e)?|jul(?:y)?|aug(?:ust)?|sep(?:tember)?|oct(?:ober)?|nov(?:ember)?|dec(?:ember)?)[a-z]*\d{4}\b' # Month/YYYY - Month/YYYY | |
| ) | |
| all_dates = [] | |
| # Iterate over the entries in the section_name | |
| for entry in sections_text[section_name]: | |
| entry = entry.lower() | |
| matches = re.findall(date_pattern, entry) | |
| if matches and len(matches)>1: | |
| if len(matches) == 2: | |
| all_dates.append(f"{matches[0]} {matches[1]}") | |
| else: | |
| all_dates.extend(matches) | |
| return all_dates | |
    def convert_to_date(self, date_str):
        """Parse a date or date-range string into (start_date, end_date).

        Accepts fragments such as "jan 2020 mar 2021", "01/2020 03/2021", or a
        standalone year; month-level precision resolves to the 1st of the month.

        Returns:
            tuple(datetime.date, datetime.date) on success, or [] when the
            string yields no (or more than two) parseable fragments — callers
            must handle both shapes.
        """
        # Mapping of month names and abbreviations to their numeric equivalents
        month_map = {
            'jan': 1, 'january': 1, 'feb': 2, 'february': 2,
            'mar': 3, 'march': 3, 'apr': 4, 'april': 4,
            'may': 5, 'jun': 6, 'june': 6, 'jul': 7,
            'july': 7, 'aug': 8, 'august': 8, 'sep': 9,
            'september': 9, 'oct': 10, 'october': 10,
            'nov': 11, 'november': 11, 'dec': 12, 'december': 12,
            '01': 1, '02': 2, '03': 3, '04': 4,
            '05': 5, '06': 6, '07': 7, '08': 8,
            '09': 9, '10': 10, '11': 11, '12': 12
        }
        # Regex patterns to match different date formats
        pattern_mm_yyyy = re.compile(r'(\d{1,2})/(\d{4})')
        pattern_mm_yyyy_space = re.compile(r'(\d{1,2})\s(\d{4})')
        pattern_month_yyyy = re.compile(r'([a-zA-Z]+)\s?(\d{4})')
        pattern_yyyy = re.compile(r'(\d{4})')
        def extract_date(date_str):
            # Try the most specific formats first; fall back to a bare year.
            match_mm_yyyy = pattern_mm_yyyy.match(date_str)
            match_mm_yyyy_space = pattern_mm_yyyy_space.match(date_str)
            match_month_yyyy = pattern_month_yyyy.match(date_str)
            match_yyyy = pattern_yyyy.match(date_str)
            if match_mm_yyyy:
                month = int(match_mm_yyyy.group(1))
                year = int(match_mm_yyyy.group(2))
            elif match_mm_yyyy_space:
                month = int(match_mm_yyyy_space.group(1))
                year = int(match_mm_yyyy_space.group(2))
            elif match_month_yyyy:
                # NOTE(review): month_map.get may return None for an unknown
                # month word, which would make datetime.date raise below.
                month = month_map.get(match_month_yyyy.group(1).lower())
                year = int(match_month_yyyy.group(2))
            elif match_yyyy:
                # Bare year: pin to January.
                month = 1
                year = int(match_yyyy.group(1))
            else:
                # NOTE(review): returning [] here makes the attribute access /
                # unpacking below fail for a one- or two-part unparsable date.
                return []
            return datetime.date(year, month, 1)
        # Split the input into date fragments (expected: one or two).
        date_parts = re.findall(r'(\d{4}\s[a-zA-Z]+\s?|\d{4}[a-zA-Z]+|\d{4}\/\d{2}|\d{4}\s\d{2}|[a-zA-Z]+\s?\d{4}|\d{4}\s[a-zA-Z]+)', date_str)
        if len(date_parts) == 1:
            # Standalone year or single date
            start_date = extract_date(date_parts[0])
            end_date = datetime.date(start_date.year, start_date.month, start_date.day)
        elif len(date_parts) == 2:
            # Date range
            start_date = extract_date(date_parts[0])
            end_date = extract_date(date_parts[1])
        else:
            return []
        return start_date, end_date
| def date_time(self, date_parts): | |
| converted_dates = [] | |
| for date_part in date_parts: | |
| start_date, end_date = self.convert_to_date(date_part) | |
| converted_dates.append((start_date, end_date)) | |
| return converted_dates | |
| def check_chronological_order(self, converted_dates, section_name ): | |
| suggestion = "" | |
| sorted_dates = sorted(converted_dates, key=lambda x: (x[1], x[0]), reverse=True) | |
| if converted_dates == sorted_dates: | |
| suggestion = f"{section_name} section is in chronological order." | |
| else: | |
| suggestion = f"{section_name} section is not in chronological order." | |
| return suggestion | |
| def check_common_projects(self, projects_text): | |
| found_projects = [] | |
| for project in common_projects: | |
| if project.lower() in projects_text.lower(): | |
| found_projects.append(project) | |
| return found_projects | |
| def recommend_resources(): | |
| # Randomly pick 2 blog articles and 2 YouTube links | |
| recommended_blogs = random.sample(blog_articles, 2) | |
| recommended_youtube = random.sample(youtube_links, 2) | |
| # Return the recommendations | |
| return { | |
| "Recommended Blogs": recommended_blogs, | |
| "Recommended YouTube Links": recommended_youtube | |
| } | |
| def check_imarticus_certifications(self, certifications_text): | |
| # Check if "imarticus" is present in the certifications text | |
| if "imarticus" in certifications_text.lower(): | |
| return { | |
| "found": True, | |
| "message": "Imarticus certification found. Please upload it in the academic section." | |
| } | |
| return { | |
| "found": False, | |
| "message": "No Imarticus certification found in the provided text." | |
| } | |
| def chronological_order_check(self, sections_text, section_name): | |
| order_suggestion = "" | |
| suggestion = "" | |
| section_name = section_name.upper() | |
| if section_name in sections_text: | |
| date = self.parse_dates(sections_text, section_name) | |
| if date: | |
| converted_dates = self.date_time(date) | |
| order_suggestion = self.check_chronological_order(converted_dates, section_name) | |
| else: | |
| suggestion = f"No valid dates found in {section_name} section. " | |
| else: | |
| suggestion = f"{section_name} is not in section header. " | |
| return order_suggestion, suggestion | |
| # Function to check for spelling mistakes | |
| def check_spelling(self, headers, section_headers): | |
| suggestions = [] | |
| for header in headers: | |
| if header.upper() not in map(str.upper, section_headers): | |
| suggestions = header | |
| return suggestions | |
    def is_present_name(name):
        """
        Checks if a given name has at least 2 words.
        Args:
            name: The name string to check.
        Returns:
            True if it has at least 2 words, false otherwise.
        """
        # NOTE(review): this definition is missing `self` and is shadowed by
        # the later `is_present_name(self, name)` in this class, so it is dead
        # code — consider deleting it.
        parts = name.split()
        return len(parts) >= 2
    def is_sentence_case(name):
        """Return True when every word starts uppercase with the rest lowercase."""
        # NOTE(review): this definition is missing `self` and is shadowed by
        # the later `is_sentence_case(self, name)` in this class, so it is
        # dead code — consider deleting it.
        parts = name.split()  # Split into individual words
        for part in parts:
            if not part:  # handles empty strings in name
                continue
            if not part[0].isupper() or not part[1:].islower():
                return False  # Check if first letter is uppercase and rest are lowercase
        return True
| def is_present_name(self,name): | |
| parts = name.split() | |
| return len(parts) >= 2 | |
| def is_sentence_case(self,name): | |
| parts = name.split() | |
| for part in parts: | |
| if not part: | |
| continue | |
| if not part[0].isupper() or not part[1:].islower(): | |
| return False | |
| return True | |
| def extract_project_links(self,sections_text): | |
| project_links = {} | |
| if "PROJECTS" in sections_text: | |
| project_list = sections_text.get("PROJECTS", []) | |
| url_pattern = r"https?://[^\s]+" | |
| for project in project_list: | |
| links = re.findall(url_pattern,project) | |
| if links: | |
| project_links[project] = links | |
| return project_links | |
| def count_sentences(self,text): | |
| sentence_endings = r"(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|!)\s" | |
| sentences = re.split(sentence_endings, text) | |
| sentences = [s.strip() for s in sentences if s.strip()] | |
| return len(sentences) | |
| def calculate_summary_score(self, summary): | |
| score = 0 # Initialize score | |
| if not summary: | |
| return score | |
| num_sentences = self.count_sentences(summary) | |
| if num_sentences <= 2: | |
| return 2 | |
| elif num_sentences > 2 and num_sentences <= 4: | |
| return 3 | |
| elif num_sentences > 4: | |
| return 5 | |
| else: | |
| return 0 | |
| def calculate_extra_urls_bonus(self,pdf_path): | |
| domains = [ | |
| r"hackerrank\.com", # Hackerrank | |
| r"leetcode\.com", # LeetCode | |
| r"medium\.com" # Medium | |
| ] | |
| extra_urls = self.extract_extra_urls_pdf(pdf_path, domains) | |
| has_extra_urls = any(urls for urls in extra_urls.values()) | |
| return 5 if has_extra_urls else 0 | |
| def calculate_relevant_experience_score(self, experience_text): | |
| """ | |
| Assigns a score based on the presence of relevant experience keywords. | |
| Args: | |
| experience_text (str): The extracted work experience section text. | |
| Returns: | |
| int: A score of 5 if relevant keywords are found, otherwise 0. | |
| """ | |
| if not experience_text: | |
| return 0 # β No experience section β Score 0 | |
| if isinstance(experience_text, list): | |
| experience_text = " ".join(experience_text) # β Convert list to a single string | |
| experience_text = experience_text.strip().lower() # β Ensure it's a string and lowercase | |
| # β Check if any keyword from 'data_science_skills' or 'essential_skills' exists | |
| for skill in config.data_science_skills + config.essential_skills: | |
| if skill.lower() in experience_text: | |
| return 5 # β Found relevant experience β Full score | |
| return 0 | |
| def calculate_ds_skills_score(self, skills_present): | |
| if not skills_present: # No skills found at all | |
| return 0 | |
| # Use skills from config instead of hardcoded list | |
| ds_skills_list_lower = [skill.lower() for skill in config.data_science_skills] | |
| skills_present_lower = [skill.lower() for skill in skills_present] | |
| matching_count = sum(1 for skill in skills_present_lower | |
| if skill in ds_skills_list_lower) | |
| if matching_count == 0: # Skills found but none match DS list | |
| return 2 | |
| elif 1 <= matching_count <= 5: | |
| return 3 | |
| elif matching_count > 5: | |
| return 5 | |
| return 0 | |
| def calculate_project_link_score(self, projects_with_links): | |
| """ | |
| Assigns a score based on whether project links are present. | |
| Args: | |
| projects_with_links (int): The number of projects with links. | |
| Returns: | |
| int: 2 if project links are found, otherwise 0. | |
| """ | |
| return 2 if projects_with_links > 0 else 0 | |
    def imarticus_review_score(self,name,contact_number,email,linkedin_urls,github_url,missing_sections,sections_not_capitalized,common_projects,section_order_suggestion,sections_text,skills,relevant_experience_score):
        """Compute the aggregate resume review score.

        Sums sub-scores for name, contact number, email, LinkedIn/GitHub URLs,
        section completeness, project descriptions and links, DS skills,
        certifications, and the caller-provided relevant-experience score.
        Several alternative scoring blocks below are disabled via string
        literals (effectively comments) — left as-is.
        """
        score = 0
        # --- Name: +3 for sentence case, else +1.5 for a multi-word name ---
        if name:
            name_parts = name.split()
            num_parts = len(name_parts)
            if num_parts == 0:
                score += 0  # NOTE(review): no-op; a truthy name never splits to zero parts
            if self.is_sentence_case(name):
                score += 3
            elif self.is_present_name(name):
                score += 1.5
        # --- Contact number: +3 for a valid 10-digit Indian mobile number ---
        if contact_number and isinstance(contact_number, str):
            digits_only = re.sub(r'\D', '', contact_number)
            if digits_only.startswith("91") and len(digits_only) > 10:
                digits_only = digits_only[2:]  # Remove the first two characters ('91')
            if len(digits_only) == 10 and digits_only[0] in "6789":  # Check for valid Indian mobile numbers
                score += 3
        # --- Email: +3 when syntactically valid ---
        if email:
            score += 3 if self.is_valid_email(email) else 0
        # --- LinkedIn: +3 for any LinkedIn URL ---
        score += 3 if linkedin_urls else 0
        # --- GitHub: +3 when the URL checks out (is_valid_url returns advice text on failure) ---
        if github_url:
            github_suggestion = self.is_valid_url(github_url)
            score += 3 if not github_suggestion else 0
        else:
            score += 0
        # --- Section completeness: up to +10 ---
        if len(missing_sections)==0 and len(sections_not_capitalized)==0:
            score+=10
        elif len(missing_sections)==0 and len(sections_not_capitalized)>0:
            score+=8
        elif len(missing_sections)<=3:
            score+=6
        elif len(missing_sections)>4:
            score+=3
        # NOTE(review): exactly 4 missing sections falls through with 0 points.
        # --- Originality: +5 when no overused "common" projects are present ---
        if common_projects:
            score +=0
        else:
            score +=5
        # --- Section ordering: -2 penalty when an order suggestion exists ---
        if section_order_suggestion:
            score -= 2
        else:
            score  # NOTE(review): bare expression — intentional no-op branch
        # Disabled alternative skills scoring (string literal, not executed):
        """
        ds_skills_list_lower = [skill.lower() for skill in data_science_skills]
        skills_present_lower = [skill.lower() for skill in self.extract_skills_from_resume(skills) ]
        matching_skill_count = 0
        for skill in skills_present_lower:
            if ds_skills_list_lower:
                matching_skill_count+=1
        if matching_skill_count==0:
            score+=0
        if matching_skill_count<=5:
            score+=2
        elif matching_skill_count>=10 and matching_skill_count<=15:
            score+5
        else:
            score+=8
        """
        # --- Projects with descriptions: up to +5 (case-sensitive "Description") ---
        if "PROJECTS" not in sections_text:
            score+=0
        else:
            project_list = sections_text.get("PROJECTS",[])
            project_count = len([x for x in project_list if "Description" in x])
            if project_count<=2:
                score+=2
            elif project_count>2 and project_count<=4:
                score+=5
            elif project_count>4:
                score+=3
        resume_data = {}
        # Extract projects & links
        project_links = self.extract_project_links(sections_text)
        projects_with_links = len(project_links)
        # β Count only projects with descriptions
        valid_projects = [
            p for p in sections_text.get("PROJECTS", []) if "description" in p.lower()
        ]
        total_projects = len(valid_projects)  # β Count projects properly
        # β Calculate project link score
        project_link_score = self.calculate_project_link_score(projects_with_links)
        resume_data["project_link_score"] = project_link_score
        # β Prevent division by zero
        if total_projects > 0:
            if projects_with_links == 0:
                score += 0
            elif projects_with_links / total_projects >= 0.5:
                score += 1.5
                if projects_with_links == total_projects:
                    score += 3
        else:
            score += 0  # β Ensure no division error if no projects exist
        # Disabled summary scoring (string literal, not executed):
        """"
        profile_summary = sections_text.get("PROFILE SUMMARY", "")
        print(profile_summary)
        summary_score = self.calculate_summary_score(profile_summary)
        score += summary_score
        """
        # --- Data-science skill overlap ---
        ds_skills_score = self.calculate_ds_skills_score(skills)
        score += ds_skills_score
        # --- Certifications: up to +7 by entry count ---
        certifications = sections_text.get("CERTIFICATIONS & ACADEMIC ENDEAVOURS", [])
        num_certifications = len(certifications)
        if num_certifications==0:
            score+=0
        elif 0 < num_certifications <= 2:
            score+=3
        elif 2 < num_certifications <= 4:
            score+=5
        elif num_certifications>4:
            score+=7
        # Disabled extra-URL bonus (string literal; references pdf_path which
        # is not a parameter here — would fail if re-enabled as-is):
        """
        extra_urls_bonus = self.calculate_extra_urls_bonus(pdf_path)
        score += extra_urls_bonus
        """
        # --- Caller-provided components ---
        score += relevant_experience_score
        score += project_link_score
        return score
| def imarticus_detailed_score(self, name, contact_number, email, linkedin_urls, github_url, | |
| missing_sections=None, sections_not_capitalized=None, common_projects=None, | |
| section_order_suggestion=None, sections_text=None, skills=None, | |
| relevant_experience_score=0): | |
| # Ensure lists and dictionaries have default values to avoid 'NoneType' errors | |
| missing_sections = missing_sections or [] | |
| sections_not_capitalized = sections_not_capitalized or [] | |
| common_projects = common_projects or [] | |
| sections_text = sections_text or {} | |
| score_breakdown = { | |
| "name_score": 0, | |
| "contact_number_score": 0, | |
| "email_score": 0, | |
| "linkedin_url_score": 0, | |
| "github_url_score": 0, | |
| "missing_sections_score": 0, | |
| "common_projects_score": 0, | |
| "section_order_score": 0, | |
| "projects_score": 0, | |
| "certifications_score": 0, | |
| "relevant_experience_score": 0, | |
| "ds_skills_score": 0, | |
| "extra_urls_bonus": 0, | |
| "summary_score": 0, | |
| "project_link_score": 0 | |
| } | |
| # β Name Score (3 Points) | |
| if name: | |
| if self.is_sentence_case(name): | |
| score_breakdown["name_score"] = 3 | |
| elif self.is_present_name(name): | |
| score_breakdown["name_score"] = 1.5 | |
| # β Contact Number Score (3 Points) | |
| if contact_number and isinstance(contact_number, str): | |
| digits_only = re.sub(r'\D', '', contact_number) | |
| if digits_only.startswith("91") and len(digits_only) > 10: | |
| digits_only = digits_only[2:] | |
| if len(digits_only) == 10 and digits_only[0] in "6789": | |
| score_breakdown["contact_number_score"] = 3 | |
| # β Email Score (3 Points) | |
| score_breakdown["email_score"] = 3 if email and self.is_valid_email(email) else 0 | |
| # β LinkedIn URL Score (3 Points) | |
| score_breakdown["linkedin_url_score"] = 3 if linkedin_urls else 0 | |
| # β GitHub URL Score (3 Points) | |
| if github_url and self.is_valid_url(github_url): | |
| score_breakdown["github_url_score"] = 3 | |
| # β Missing Sections Score (10 Points) | |
| if not missing_sections and not sections_not_capitalized: | |
| score_breakdown["missing_sections_score"] = 10 | |
| elif not missing_sections and sections_not_capitalized: | |
| score_breakdown["missing_sections_score"] = 8 | |
| elif len(missing_sections) <= 3: | |
| score_breakdown["missing_sections_score"] = 6 | |
| else: | |
| score_breakdown["missing_sections_score"] = 3 | |
| # β Common Projects Score (5 Points) | |
| score_breakdown["common_projects_score"] = 0 if common_projects else 5 | |
| # β Section Order Score (2 Points) | |
| score_breakdown["section_order_score"] = -2 if section_order_suggestion else 0 | |
| # β Projects Score (5 Points) | |
| if "PROJECTS" in sections_text: | |
| project_list = sections_text.get("PROJECTS", []) | |
| project_count = len([x for x in project_list if "Description" in x]) | |
| if project_count <= 2: | |
| score_breakdown["projects_score"] = 2 | |
| elif 2 < project_count <= 4: | |
| score_breakdown["projects_score"] = 5 | |
| else: | |
| score_breakdown["projects_score"] = 3 | |
| # β Certifications Score (7 Points) | |
| certifications = sections_text.get("CERTIFICATIONS & ACADEMIC ENDEAVOURS", []) | |
| num_certifications = len(certifications) | |
| if num_certifications == 0: | |
| score_breakdown["certifications_score"] = 0 | |
| elif 0 < num_certifications <= 2: | |
| score_breakdown["certifications_score"] = 3 | |
| elif 2 < num_certifications <= 4: | |
| score_breakdown["certifications_score"] = 5 | |
| else: | |
| score_breakdown["certifications_score"] = 7 | |
| # β Relevant Experience Score (5 Points) | |
| score_breakdown["relevant_experience_score"] = relevant_experience_score if relevant_experience_score is not None else 0 | |
| # β Data Science Skills Score (5 Points) | |
| score_breakdown["ds_skills_score"] = self.calculate_ds_skills_score(skills) | |
| # β Extra URLs Bonus (5 Points) | |
| score_breakdown["extra_urls_bonus"] = self.calculate_extra_urls_bonus(sections_text) | |
| # β Summary Score (5 Points) | |
| profile_summary = sections_text.get("PROFILE SUMMARY", "") | |
| score_breakdown["summary_score"] = self.calculate_summary_score(profile_summary) | |
| # β Project Link Score (2 Points) | |
| project_links = self.extract_project_links(sections_text) | |
| projects_with_links = len(project_links) | |
| score_breakdown["project_link_score"] = self.calculate_project_link_score(projects_with_links) | |
| return score_breakdown | |
| def parse_text(self, path): | |
| logger = logging.getLogger(__name__) | |
| logging.getLogger("pdfminer").setLevel(logging.WARNING) | |
| resume_data = {} | |
| logger.debug('parsing text') | |
| text = self.extract_text_from_pdf(path) | |
| text1 = " ".join(text.split("\n")) | |
| skills_found = self.extract_skills_from_resume(text) | |
| found_keywords = self.extract_keyword_variations_from_resume(text) | |
| sections_text = self.segregate_sections(text) | |
| formatted_text = self.extract_and_format_sections(sections_text, Extract_sections) | |
| found_keyword_section = self.extract_keyword_variations_from_formatted_text(formatted_text) | |
| parsed_sections = self.segregate_sections(text) | |
| projects = parsed_sections.get("PROJECTS", []) | |
| certifications = parsed_sections.get("CERTIFICATIONS & ACADEMIC ENDEAVOURS", []) | |
| projects_text = "\n".join(projects) | |
| certifications_text = "\n".join(certifications) | |
| found_imarticus_certification = self.check_imarticus_certifications(certifications_text) | |
| found_projects = self.check_common_projects(projects_text) | |
| name, name_suggestion = self.extract_name(text) | |
| contact_number, contact_suggestion = self.extract_contact_number_from_resume(text) | |
| email, email_suggestion = self.extract_email_from_resume(path) | |
| github_urls = self.extract_github_urls_from_pdf(path) | |
| github_urls_suggestions = self.is_valid_url(github_urls) | |
| linkedin_urls = self.extract_linkedIn_urls_from_pdf(path) | |
| section_by_grammer_issues = self.grammar_issue_check(text, found_keyword_section, Extract_sections) | |
| domains = [ | |
| r"hackerrank\.com", # Hackerrank | |
| r"leetcode\.com", # LeetCode | |
| r"medium\.com" # Medium | |
| ] | |
| extra_urls = self.extract_extra_urls_pdf(path, domains) | |
| education_order_suggestion, education_suggestion = self.chronological_order_check(sections_text, "ACADEMIC PROFILE") | |
| experience_order_suggestion, experience_suggestion = self.chronological_order_check(sections_text, "WORK EXPERIENCE") | |
| headers = list(sections_text.keys()) | |
| spelling_suggestions = self.check_spelling(headers, section_headers) | |
| predefined_terms = [name, email] | |
| predefined_terms.extend(required_sections) | |
| text_properties = self.extract_text_properties(path, predefined_terms) | |
| grouped_properties = self.group_similar_fonts(text_properties) | |
| different_texts = self.identify_different_fonts_and_sizes(grouped_properties) | |
| font_suggestions = [] | |
| for item in different_texts: | |
| font_suggestion = f"Formatting issue at Page: {item['page_num']}, Text: {item['text']}, Reason: {item['reason']}, Found font size: {item['found_size']}, Found font name: {item['found_font_name']}" | |
| font_suggestions.append(font_suggestion) | |
| missing_sections, sections_not_capitalized = self.extract_sections_from_resume(text) | |
| linkedin_urls_suggestion = str() | |
| common_project = str() | |
| if not name: | |
| name_suggestion = "Please add name to the resume." | |
| if not contact_number: | |
| contact_suggestion = "Please add the contact number to the resume." | |
| if not email: | |
| email_suggestion = "Please add the email address to the resume." | |
| if not github_urls: | |
| github_urls_suggestions = "add the github_urls to the resume." | |
| if not linkedin_urls: | |
| linkedin_urls_suggestion = "add the linkedin_urls to the resume." | |
| if found_projects: | |
| common_project = "Common projects found in Projects section: " | |
| for project in found_projects: | |
| common_project += project | |
| # Replace the existing project length suggestion code with: | |
| project_list = sections_text.get("PROJECTS", []) | |
| projects_with_description = [ | |
| p for p in project_list | |
| if "description" in p.lower() | |
| ] | |
| project_count = len(projects_with_description) | |
| if project_count == 0: | |
| project_length_suggestion = "No projects found. Consider at least 2 projects." | |
| elif project_count == 1: | |
| project_length_suggestion = "Only 1 project found. Consider adding 1 more project." | |
| else: | |
| project_length_suggestion = f"{project_count} projects found." | |
| # Store in resume data (keeps your existing URL extraction) | |
| resume_data["project_length_suggestion"] = project_length_suggestion | |
| experience_text = sections_text.get("WORK EXPERIENCE", "") # β Extract work experience section | |
| relevant_experience_score = self.calculate_relevant_experience_score(experience_text) # β Calculate score | |
| # β Store in the final resume data output | |
| resume_data["relevant_experience_score"] = relevant_experience_score | |
| recommended_blogs = random.sample(blog_articles, 2) | |
| recommended_youtube = random.sample(youtube_links, 2) | |
| # Calculate imarticus_score | |
| imarticus_score = self.imarticus_review_score( | |
| name, | |
| contact_number, | |
| email, | |
| linkedin_urls, | |
| github_urls, | |
| missing_sections, | |
| sections_not_capitalized, | |
| common_projects=found_projects, # Ensure to pass found projects | |
| section_order_suggestion=experience_order_suggestion, | |
| skills=skills_found, # Pass order suggestion | |
| sections_text=sections_text, | |
| relevant_experience_score=relevant_experience_score, | |
| # project_link_score=project_link_score | |
| #pdf_path=path | |
| #relevant_keywords_found=bool(found_keywords), # Convert to boolean | |
| #experience_orderly_arranged=experience_order_suggestion, # Pass orderly arrangement check | |
| #experience_section_present="WORK EXPERIENCE" in sections_text # Check if experience section is present | |
| ) | |
| # Populate resume data dictionary | |
| resume_data = { | |
| "name": name, | |
| "contact_number": contact_number, | |
| "email": email, | |
| "linkedin_urls": linkedin_urls, | |
| "experience_order_suggestion": experience_order_suggestion, | |
| "education_order_suggestion": education_order_suggestion, | |
| "grammer_issues_by_section": section_by_grammer_issues, | |
| "github_urls": github_urls, | |
| "skills": skills_found, | |
| "spelling_suggestions": spelling_suggestions, | |
| "found_keywords": found_keywords, | |
| "text": text, | |
| "font_suggestions": font_suggestions, | |
| "name_suggestion": name_suggestion, | |
| "contact_suggestion": contact_suggestion, | |
| "email_suggestion": email_suggestion, | |
| "imarticus_score": imarticus_score, | |
| "github_urls_suggestions": github_urls_suggestions, | |
| "linkedin_urls_suggestion": "Add the LinkedIn URLs to the resume." if not linkedin_urls else "", | |
| "missing_sections": missing_sections, | |
| "common_projects": "Common projects found in Projects section: " + ", ".join(found_projects) if found_projects else "", | |
| "project_length_suggestion": project_length_suggestion, | |
| "extra_urls": extra_urls, | |
| "certifications": { | |
| "found": found_imarticus_certification["found"], | |
| "message": found_imarticus_certification["message"], | |
| "text": certifications_text # Store extracted certification text | |
| }, | |
| "recommended_blogs": recommended_blogs, | |
| "recommended_youtube_links": recommended_youtube | |
| } | |
| # Additional checks and data additions | |
| if "WORK EXPERIENCE" in sections_text.keys() and "WORK EXPERIENCE" != list(sections_text.keys())[2]: | |
| section_order_suggestion = f"WORK EXPERIENCE should come before {list(sections_text.keys())[2]}" | |
| resume_data["section_order_suggestion"] = section_order_suggestion | |
| missing_important_sections = self.check_missing_sections(resume_data) | |
| resume_data["basic_information_section"] = missing_important_sections or "Basic information is Found" | |
| missing_skills = list(set(essential_skills) - set(skills_found)) | |
| resume_data["missing_skills"] = missing_skills | |
| found_keywords_count = len(resume_data["found_keywords"]) | |
| num_keywords = len(keyword_variations) | |
| quality_mapping = {"Low": 0.2, "Medium": 0.5, "High": 0.8} # Assuming some quality mapping | |
| for quality, threshold in quality_mapping.items(): | |
| if found_keywords_count < num_keywords * threshold: | |
| resume_data["quality"] = quality | |
| break | |
| found_certification = "Imarticus certification found in Certifications section." if found_imarticus_certification else "No Imarticus certification found in Certifications section." | |
| resume_data["found_certification"] = found_certification | |
| # Experience relevance check | |
| Extract_exp_sections = ['WORK EXPERIENCE'] | |
| experience_text = self.extract_and_format_sections(sections_text, Extract_exp_sections) | |
| if experience_text: | |
| resume_data["work_experience_check"] = "Experience is relevant to Data science." if any(variation.lower() in experience_text.lower() for keyword, variations in keyword_variations.items() for variation in variations) else "Experience is not relevant to Data science." | |
| return jsonify(resume_data) | |