# Parse the different sections of a resume PDF.
from pdfminer.high_level import extract_pages, extract_text
from pdfminer.layout import LTTextContainer, LTChar, LTTextLineHorizontal
from collections import defaultdict
from flask import jsonify
import re, fitz, requests, logging, datetime
from .config import data_science_skills, keyword_variations, essential_skills, quality_mapping, Extract_sections, suggested_projects, ignore_rule_ids
from .config import required_sections, linkedin_domain, github_domain, basic_informations, section_headers, common_projects, ignore_error_keywords, blog_articles, youtube_links
from .config import kaggle_domain, hackerrank_domain, leetcode_domain, medium_domain
from spacy.matcher import Matcher
import language_tool_python
import random

tool = language_tool_python.LanguageTool('en-US')


class ResumeParser:

    def extract_contact_number_from_resume(self, text):
        contact_number = None
        suggestion = ""
        # Use a regex pattern to find a potential contact number.
        pattern = r"\b(?:\+?\d{1,3}[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}\b"
        match = re.search(pattern, text)
        if match:
            contact_number = match.group()
            # Check that the contact number has the correct number of digits.
            digits_only = re.sub(r'\D', '', contact_number)
            if len(digits_only) == 10:
                suggestion = ""
            elif len(digits_only) > 10 and digits_only.startswith('91') and len(digits_only[2:]) == 10:
                suggestion = ""
            else:
                suggestion = "Contact number should have exactly 10 digits."
        return contact_number, suggestion

    def extract_hyperlinks(self, pdf_path):
        doc = fitz.open(pdf_path)
        links = []
        for page_num in range(len(doc)):
            page = doc.load_page(page_num)
            link_list = page.get_links()
            for link in link_list:
                uri = link.get('uri', None)
                if uri:
                    links.append(uri)
        return links

    def extract_text_from_pdf(self, pdf_path):
        return extract_text(pdf_path)

    def extract_email_from_text(self, text):
        pattern = r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b"
        match = re.search(pattern, text)
        if match:
            return match.group()
        return None

    def extract_email_from_resume(self, pdf_path):
        text = self.extract_text_from_pdf(pdf_path)
        email = self.extract_email_from_text(text)
        suggestion = ""
        # If no email was found in the text, check the hyperlinks.
        if not email:
            links = self.extract_hyperlinks(pdf_path)
            for link in links:
                if link.startswith('mailto:'):
                    email_candidate = link.split('mailto:')[1]
                    if self.is_valid_email(email_candidate):
                        email = email_candidate
                        break
        # Additional validation for an email found in the text or links.
        if email and not self.is_valid_email(email):
            suggestion += "Your email address doesn't seem to be valid. Please check and correct."
        return email, suggestion

    def is_valid_email(self, email):
        # Length check.
        if len(email) > 254:
            return False
        # Reject consecutive special characters.
        if re.search(r"[._%+-]{2,}", email):
            return False
        # Validate the domain part.
        domain_part = email.split('@')[1]
        if not re.match(r"[A-Za-z0-9.-]+\.[A-Za-z]{2,}", domain_part):
            return False
        # Standard email format check.
        pattern = r"^[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$"
        return re.match(pattern, email) is not None

    def extract_sections_from_resume(self, text):
        missing_sections = []
        sections_not_capitalized = []
        for section in required_sections:
            pattern = r"\b{}\b".format(re.escape(section))
            match_obj = re.search(pattern, text, re.IGNORECASE)
            if not match_obj:
                missing_sections.append(section)
            elif match_obj.group() not in map(str.upper, required_sections):
                sections_not_capitalized.append(section)
        return missing_sections, sections_not_capitalized

    def extract_skills_from_resume(self, text):
        if not isinstance(text, str):
            raise ValueError(f"Expected 'text' to be a string, but got {type(text)}")
        skills = []
        for skill in essential_skills:
            pattern = r"\b{}\b".format(re.escape(skill))
            if re.search(pattern, text, re.IGNORECASE):
                skills.append(skill)
        return skills

    def extract_keyword_variations_from_resume(self, text):
        found_keywords = []
        for keyword, variations in keyword_variations.items():
            for variation in variations:
                if variation.lower() in text.lower():
                    found_keywords.append(variation)
                    break
        return found_keywords

    def extract_keyword_variations_from_formatted_text(self, formatted_text):
        found_keyword_section = []
        for keyword, variations in keyword_variations.items():
            for variation in variations:
                if variation.lower() in formatted_text.lower():
                    found_keyword_section.append(variation)
                    break
        return found_keyword_section

    def extract_linkedIn_urls_from_pdf(self, pdf_path):
        linkedin_urls = None
        pdf_document = fitz.open(pdf_path)
        for page_num in range(len(pdf_document)):
            page = pdf_document.load_page(page_num)
            for link in page.get_links():
                url = link.get('uri', '')
                if re.search(linkedin_domain, url):
                    linkedin_urls = url
        pdf_document.close()
        return linkedin_urls

    def extract_github_urls_from_pdf(self, pdf_path):
        github_urls = None
        pdf_document = fitz.open(pdf_path)
        for page_num in range(len(pdf_document)):
            page = pdf_document.load_page(page_num)
            for link in page.get_links():
                url = link.get('uri', '')
                if re.search(github_domain, url):
                    # Keep only profile URLs (a single path segment after the domain).
                    path = re.sub(github_domain, '', url)
                    parts = path.split('/')
                    if len(parts) == 1:
                        github_urls = url
        pdf_document.close()
        return github_urls

    def extract_extra_urls_pdf(self, pdf_path, domains):
        extracted_urls = defaultdict(set)
        pdf_document = None
        try:
            # Open the PDF document.
            pdf_document = fitz.open(pdf_path)
            # Iterate through all pages in the PDF.
            for page_num in range(len(pdf_document)):
                page = pdf_document.load_page(page_num)
                for link in page.get_links():
                    url = link.get('uri', '')
                    if url:  # Ensure there's a URL.
                        for domain in domains:
                            if re.search(domain, url, re.IGNORECASE):
                                extracted_urls[domain].add(url)  # Add URL to the domain's set.
        except Exception as e:
            print(f"Error processing PDF: {e}")
        finally:
            if pdf_document:
                pdf_document.close()
        return {domain: list(urls) for domain, urls in extracted_urls.items()}

    def is_valid_url(self, github_urls):
        # Returns an empty string when the URL is reachable (or absent),
        # and a suggestion string when it is not.
        suggest = ""
        if not github_urls:
            return suggest
        try:
            response = requests.head(github_urls)
            if response.status_code != 200:
                suggest = "GitHub URL is not valid, please check and correct."
" except requests.RequestException: suggest = "GitHub URL is not valid, please check and correct. " return suggest return suggest def is_valid_name(self, name): if any(char.isdigit() for char in name): return False if len(name.split()) > 3: return False common_non_names = {"Email", "Github", "LinkedIn", "Portfolio", "Data Analyst"} if name in common_non_names: return False return True def extract_name(self, resume_text): lines = resume_text.split('\n') # Use regex to find lines that likely contain names name_lines = [line for line in lines if re.match(r'^[A-Za-z]*\s[A-Za-z]*', line.strip())] names = [] for i in range(len(name_lines)): if self.is_valid_name(name_lines[i].strip()): names.append(name_lines[i].strip()) if len(names) >= 1: name = names[0] suggestion = "" # Check if the name parts contain only alphabetic characters name_parts = name.split() if any(part[0].islower() for part in name_parts): suggestion += " name should start with a capital letter. " return name, suggestion return None, "No valid name found" def check_missing_sections(self, resume_data): missing_information = [] for section in basic_informations: if not resume_data.get(section): missing_information.append(section) return missing_information def segregate_sections(self, text): header_pattern = re.compile(rf'^\s*({"|".join(re.escape(header) for header in section_headers)}):?\s*$', re.IGNORECASE) sections_text = {} current_section = None lines = text.splitlines() for line in lines: clean_line = line.strip() match = header_pattern.match(clean_line) if match: current_section = match.group(1).upper() sections_text[current_section] = [] elif current_section: sections_text[current_section].append(line.strip()) return sections_text def extract_and_format_sections(self, sections_text, Extract_sections): formatted_text = "" for section in Extract_sections: if section in sections_text: section_content = " ".join(sections_text[section]).replace('\n', ' ') formatted_text += f"{section}:\n{section_content}\n\n" return formatted_text def replace_keywords_with_placeholders(self, formatted_text, found_keyword_section): placeholder_text = formatted_text keyword_placeholders = {} # Use a set to avoid duplicates and keep track of keyword placeholders used_keywords = set() for i, keyword in enumerate(found_keyword_section): if keyword not in used_keywords: used_keywords.add(keyword) placeholder = f"{{KEYWORD_{i}}}" keyword_placeholders[placeholder] = keyword # Using word boundary to match whole words placeholder_text = re.sub(r'\b' + re.escape(keyword) + r'\b', placeholder, placeholder_text, flags=re.IGNORECASE) return placeholder_text, keyword_placeholders def replace_placeholders_with_keywords(self, grammar_issues, keyword_placeholders): updated_issues = [] for issue in grammar_issues: context = issue['context'] for placeholder, keyword in keyword_placeholders.items(): context = context.replace(placeholder, keyword) # Update the context in the issue dictionary issue['context'] = context updated_issues.append(issue) return updated_issues def grammar_check(self, placeholder_text): matches = tool.check(placeholder_text) grammar_issues = [] for match in matches: issue = { "context": match.context, "error": match.message, "rule_id": match.ruleId, "suggested_correction": match.replacements } grammar_issues.append(issue) return grammar_issues def filter_grammar_issues(self, grammar_issues, ignore_rule_ids=None, ignore_error_keywords=None): if ignore_rule_ids is None: ignore_rule_ids = [] if ignore_error_keywords is None: 
            ignore_error_keywords = []
        filtered_issues = []
        for issue in grammar_issues:
            if issue['rule_id'] not in ignore_rule_ids and not any(
                    keyword in issue['error'] for keyword in ignore_error_keywords):
                filtered_issues.append(issue)
        return filtered_issues

    def process_resume(self, text, found_keyword_section, Extract_sections):
        sections_text = self.segregate_sections(text)
        formatted_text = self.extract_and_format_sections(sections_text, Extract_sections)
        found_keyword_section = self.extract_keyword_variations_from_formatted_text(formatted_text)
        placeholder_text, keyword_placeholders = self.replace_keywords_with_placeholders(
            formatted_text, found_keyword_section)
        grammar_issues = self.grammar_check(placeholder_text)
        grammar_issues = self.replace_placeholders_with_keywords(grammar_issues, keyword_placeholders)
        filtered_grammar_issues = self.filter_grammar_issues(grammar_issues, ignore_rule_ids, ignore_error_keywords)
        return filtered_grammar_issues

    def grammar_issue_check(self, text, found_keyword_section, Extract_sections):
        issues = {}
        for section in Extract_sections:
            grammar_issues = self.process_resume(text, found_keyword_section, [section])
            if not grammar_issues:
                grammar_issues = "no error found"
            issues[section] = grammar_issues
        return issues

    def normalize_font_name(self, font_name):
        if '-' in font_name:
            font_name = font_name.split('-')[0]
        if '+' in font_name:
            font_name = font_name.split('+')[1]
        return font_name

    def extract_text_properties(self, pdf_path, predefined_terms):
        text_properties = []
        current_phrase = ""
        current_font_size = None
        current_font_name = None
        current_page_num = None
        special_characters = set("●▪•!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~")

        def add_current_phrase():
            nonlocal current_phrase
            if current_phrase.strip():
                flag = any(current_phrase in term for term in predefined_terms)
                if not flag:
                    text_properties.append({
                        "text": current_phrase,
                        "font_size": current_font_size,
                        "font_name": current_font_name,
                        "page_num": current_page_num
                    })
            current_phrase = ""

        for page_layout in extract_pages(pdf_path):
            for element in page_layout:
                if isinstance(element, LTTextContainer):
                    for text_line in element:
                        if isinstance(text_line, LTTextLineHorizontal):
                            for character in text_line:
                                if isinstance(character, LTChar):
                                    text = character.get_text()
                                    font_size = round(character.size, 2)
                                    font_name = self.normalize_font_name(character.fontname)
                                    page_num = page_layout.pageid
                                    if text.isspace() or text in special_characters:
                                        add_current_phrase()
                                        continue
                                    if (font_size != current_font_size
                                            or font_name != current_font_name
                                            or page_num != current_page_num):
                                        add_current_phrase()
                                        current_font_size = font_size
                                        current_font_name = font_name
                                        current_page_num = page_num
                                    current_phrase += text
        add_current_phrase()
        return text_properties

    def group_similar_fonts(self, text_properties, tolerance=0.5):
        grouped_properties = defaultdict(list)
        for prop in text_properties:
            rounded_size = round(prop["font_size"] / tolerance) * tolerance
            key = (prop["font_name"], rounded_size)
            grouped_properties[key].append(prop)
        return grouped_properties

    def identify_different_fonts_and_sizes(self, grouped_properties):
        most_common_group = max(grouped_properties.values(), key=len)
        most_common_key = None
        for key, group in grouped_properties.items():
            if group == most_common_group:
                most_common_key = key
                break
        different_texts = []
        for key, group in grouped_properties.items():
            if group != most_common_group:
                for prop in group:
                    reason = []
                    if key[1] != most_common_key[1]:
                        reason.append(f"size not {most_common_key[1]}")
                    if key[0] != most_common_key[0]:
                        reason.append(f"font not {most_common_key[0]}")
                    different_texts.append({
                        "page_num": prop['page_num'],
                        "text": prop['text'],
                        "found_size": prop['font_size'],
                        "found_font_name": prop['font_name'],
                        "reason": ", ".join(reason)
                    })
        return different_texts

    def parse_dates(self, sections_text, section_name):
        # Date patterns matching the various formats that appear in resumes.
        date_pattern = (
            r'\b\d{1,2}/\d{4}\b|'  # MM/YYYY
            r'\b(?:jan(?:uary)?|feb(?:ruary)?|mar(?:ch)?|apr(?:il)?|may|jun(?:e)?|jul(?:y)?|aug(?:ust)?|sep(?:tember)?|oct(?:ober)?|nov(?:ember)?|dec(?:ember)?)\s+\d{4}\b|'  # Month YYYY
            r'\b(?:jan(?:uary)?|feb(?:ruary)?|mar(?:ch)?|apr(?:il)?|may|jun(?:e)?|jul(?:y)?|aug(?:ust)?|sep(?:tember)?|oct(?:ober)?|nov(?:ember)?|dec(?:ember)?)\s+\d{1,2},?\s*\d{4}\b|'  # Month DD, YYYY
            r'\b\d{4}\b|'  # YYYY
            r'\b(?:jan(?:uary)?|feb(?:ruary)?|mar(?:ch)?|apr(?:il)?|may|jun(?:e)?|jul(?:y)?|aug(?:ust)?|sep(?:tember)?|oct(?:ober)?|nov(?:ember)?|dec(?:ember)?)[a-z]*/?\d{4}\b|'  # Month/YYYY
            r'\b(?:jan(?:uary)?|feb(?:ruary)?|mar(?:ch)?|apr(?:il)?|may|jun(?:e)?|jul(?:y)?|aug(?:ust)?|sep(?:tember)?|oct(?:ober)?|nov(?:ember)?|dec(?:ember)?)[a-z]*\d{4}\s*-\s*(?:jan(?:uary)?|feb(?:ruary)?|mar(?:ch)?|apr(?:il)?|may|jun(?:e)?|jul(?:y)?|aug(?:ust)?|sep(?:tember)?|oct(?:ober)?|nov(?:ember)?|dec(?:ember)?)[a-z]*\d{4}\b'  # Month/YYYY - Month/YYYY
        )
        all_dates = []
        # Iterate over the entries in the named section.
        for entry in sections_text[section_name]:
            entry = entry.lower()
            matches = re.findall(date_pattern, entry)
            if matches and len(matches) > 1:
                if len(matches) == 2:
                    all_dates.append(f"{matches[0]} {matches[1]}")
                else:
                    all_dates.extend(matches)
        return all_dates

    def convert_to_date(self, date_str):
        # Map month names and abbreviations to their numeric equivalents.
        month_map = {
            'jan': 1, 'january': 1, 'feb': 2, 'february': 2, 'mar': 3, 'march': 3,
            'apr': 4, 'april': 4, 'may': 5, 'jun': 6, 'june': 6, 'jul': 7, 'july': 7,
            'aug': 8, 'august': 8, 'sep': 9, 'september': 9, 'oct': 10, 'october': 10,
            'nov': 11, 'november': 11, 'dec': 12, 'december': 12,
            '01': 1, '02': 2, '03': 3, '04': 4, '05': 5, '06': 6,
            '07': 7, '08': 8, '09': 9, '10': 10, '11': 11, '12': 12
        }
        # Regex patterns for the different date formats.
        pattern_mm_yyyy = re.compile(r'(\d{1,2})/(\d{4})')
        pattern_mm_yyyy_space = re.compile(r'(\d{1,2})\s(\d{4})')
        pattern_month_yyyy = re.compile(r'([a-zA-Z]+)\s?(\d{4})')
        pattern_yyyy = re.compile(r'(\d{4})')

        def extract_date(date_str):
            match_mm_yyyy = pattern_mm_yyyy.match(date_str)
            match_mm_yyyy_space = pattern_mm_yyyy_space.match(date_str)
            match_month_yyyy = pattern_month_yyyy.match(date_str)
            match_yyyy = pattern_yyyy.match(date_str)
            if match_mm_yyyy:
                month = int(match_mm_yyyy.group(1))
                year = int(match_mm_yyyy.group(2))
            elif match_mm_yyyy_space:
                month = int(match_mm_yyyy_space.group(1))
                year = int(match_mm_yyyy_space.group(2))
            elif match_month_yyyy:
                month = month_map.get(match_month_yyyy.group(1).lower())
                year = int(match_month_yyyy.group(2))
            elif match_yyyy:
                month = 1
                year = int(match_yyyy.group(1))
            else:
                return None
            return datetime.date(year, month, 1)

        date_parts = re.findall(
            r'(\d{4}\s[a-zA-Z]+\s?|\d{4}[a-zA-Z]+|\d{4}\/\d{2}|\d{4}\s\d{2}|[a-zA-Z]+\s?\d{4}|\d{4}\s[a-zA-Z]+)',
            date_str)
        if len(date_parts) == 1:
            # Standalone year or single date.
            start_date = extract_date(date_parts[0])
            end_date = datetime.date(start_date.year, start_date.month, start_date.day)
        elif len(date_parts) == 2:
            # Date range.
            start_date = extract_date(date_parts[0])
            end_date = extract_date(date_parts[1])
        else:
            return []
        return start_date, end_date

    def date_time(self, date_parts):
        converted_dates = []
        for date_part in date_parts:
            start_date, end_date = self.convert_to_date(date_part)
            converted_dates.append((start_date, end_date))
        return converted_dates

    def check_chronological_order(self, converted_dates, section_name):
        # Resumes should list the most recent dates first, hence the reverse sort.
        sorted_dates = sorted(converted_dates, key=lambda x: (x[1], x[0]), reverse=True)
        if converted_dates == sorted_dates:
            suggestion = f"{section_name} section is in chronological order."
        else:
            suggestion = f"{section_name} section is not in chronological order."
        return suggestion

    def check_common_projects(self, projects_text):
        found_projects = []
        for project in common_projects:
            if project.lower() in projects_text.lower():
                found_projects.append(project)
        return found_projects

    def recommend_resources(self):
        # Randomly pick 2 blog articles and 2 YouTube links.
        recommended_blogs = random.sample(blog_articles, 2)
        recommended_youtube = random.sample(youtube_links, 2)
        # Return the recommendations.
        return {
            "Recommended Blogs": recommended_blogs,
            "Recommended YouTube Links": recommended_youtube
        }

    def check_imarticus_certifications(self, certifications_text):
        # Check whether "imarticus" is present in the certifications text.
        if "imarticus" in certifications_text.lower():
            return {
                "found": True,
                "message": "Imarticus certification found. Please upload it in the academic section."
            }
        return {
            "found": False,
            "message": "No Imarticus certification found in the provided text."
        }

    def chronological_order_check(self, sections_text, section_name):
        order_suggestion = ""
        suggestion = ""
        section_name = section_name.upper()
        if section_name in sections_text:
            date = self.parse_dates(sections_text, section_name)
            if date:
                converted_dates = self.date_time(date)
                order_suggestion = self.check_chronological_order(converted_dates, section_name)
            else:
                suggestion = f"No valid dates found in {section_name} section. "
        else:
            suggestion = f"{section_name} is not in section header. "
        return order_suggestion, suggestion

    # Check the section headers for spelling mistakes.
    def check_spelling(self, headers, section_headers):
        suggestions = []
        for header in headers:
            if header.upper() not in map(str.upper, section_headers):
                suggestions.append(header)
        return suggestions

    def is_present_name(self, name):
        """
        Checks if a given name has at least 2 words.

        Args:
            name: The name string to check.

        Returns:
            True if it has at least 2 words, False otherwise.
""" parts = name.split() return len(parts) >= 2 def is_sentence_case(name): parts = name.split() # Split into individual words for part in parts: if not part: # handles empty strings in name continue if not part[0].isupper() or not part[1:].islower(): return False # Check if first letter is uppercase and rest are lowercase return True def is_present_name(self,name): parts = name.split() return len(parts) >= 2 def is_sentence_case(self,name): parts = name.split() for part in parts: if not part: continue if not part[0].isupper() or not part[1:].islower(): return False return True def extract_project_links(self,sections_text): project_links = {} if "PROJECTS" in sections_text: project_list = sections_text.get("PROJECTS", []) url_pattern = r"https?://[^\s]+" for project in project_list: links = re.findall(url_pattern,project) if links: project_links[project] = links return project_links def count_sentences(self,text): sentence_endings = r"(?4: return 1 else: return 0 def calculate_extra_urls_bonus(self,pdf_path): domains = [ r"hackerrank\.com", # Hackerrank r"leetcode\.com", # LeetCode r"medium\.com" # Medium ] extra_urls = self.extract_extra_urls_pdf(pdf_path, domains) has_extra_urls = any(urls for urls in extra_urls.values()) return 5 if has_extra_urls else 0 def calculate_relevant_experience_score(self, experience_text): """ Assigns a score based on the presence of relevant experience keywords. Args: experience_text (str): The extracted work experience section text. Returns: int: A score of 5 if relevant keywords are found, otherwise 0. """ if not experience_text: return 0 # ✅ No experience section → Score 0 if isinstance(experience_text, list): experience_text = " ".join(experience_text) # ✅ Convert list to a single string experience_text = experience_text.strip().lower() # ✅ Ensure it's a string and lowercase # ✅ Check if any keyword from 'data_science_skills' or 'essential_skills' exists for skill in data_science_skills + essential_skills: if skill.lower() in experience_text: return 5 # ✅ Found relevant experience → Full score return 0 def calculate_ds_skills_score(self, skills_present): if not skills_present: # No skills found at all return 0 # Use skills from config instead of hardcoded list ds_skills_list_lower = [skill.lower() for skill in data_science_skills] skills_present_lower = [skill.lower() for skill in skills_present] matching_count = sum(1 for skill in skills_present_lower if skill in ds_skills_list_lower) if matching_count == 0: # Skills found but none match DS list return 2 elif 1 <= matching_count <= 5: return 3 elif matching_count > 5: return 5 return 0 def calculate_project_link_score(self, projects_with_links): """ Assigns a score based on whether project links are present. Args: projects_with_links (int): The number of projects with links. Returns: int: 2 if project links are found, otherwise 0. 
        return 2 if projects_with_links > 0 else 0

    def imarticus_review_score(self, name, contact_number, email, linkedin_urls, github_url,
                               missing_sections, sections_not_capitalized, common_projects,
                               section_order_suggestion, sections_text, skills,
                               relevant_experience_score):
        score = 0
        if name:
            if self.is_sentence_case(name):
                score += 3
            elif self.is_present_name(name):
                score += 1.5
        if contact_number and isinstance(contact_number, str):
            digits_only = re.sub(r'\D', '', contact_number)
            if digits_only.startswith("91") and len(digits_only) > 10:
                digits_only = digits_only[2:]  # Strip the '91' country code.
            if len(digits_only) == 10 and digits_only[0] in "6789":  # Check for valid Indian mobile numbers.
                score += 3
        if email:
            score += 3 if self.is_valid_email(email) else 0
        score += 3 if linkedin_urls else 0
        if github_url:
            github_suggestion = self.is_valid_url(github_url)
            score += 3 if not github_suggestion else 0
        if len(missing_sections) == 0 and len(sections_not_capitalized) == 0:
            score += 10
        elif len(missing_sections) == 0 and len(sections_not_capitalized) > 0:
            score += 8
        elif len(missing_sections) <= 3:
            score += 6
        elif len(missing_sections) > 4:
            score += 3
        if not common_projects:
            score += 5
        if section_order_suggestion:
            score -= 2
        """
        ds_skills_list_lower = [skill.lower() for skill in data_science_skills]
        skills_present_lower = [skill.lower() for skill in self.extract_skills_from_resume(skills)]
        matching_skill_count = 0
        for skill in skills_present_lower:
            if ds_skills_list_lower:
                matching_skill_count += 1
        if matching_skill_count == 0:
            score += 0
        if matching_skill_count <= 5:
            score += 2
        elif 10 <= matching_skill_count <= 15:
            score += 5
        else:
            score += 8
        """
        if "PROJECTS" in sections_text:
            project_list = sections_text.get("PROJECTS", [])
            project_count = len([x for x in project_list if "Description" in x])
            if project_count <= 2:
                score += 2
            elif 2 < project_count <= 4:
                score += 5
            elif project_count > 4:
                score += 3
        """
        project_links = self.extract_project_links(sections_text)
        total_projects = len(sections_text.get("PROJECTS", []))
        projects_with_links = len(project_links)
        if total_projects > 0:
            if projects_with_links == 0:
                score += 0
            elif projects_with_links / total_projects >= 0.5:
                score += 1.5
                if projects_with_links == total_projects:
                    score += 3
        """
        resume_data = {}
        # Extract projects & links.
        project_links = self.extract_project_links(sections_text)
        projects_with_links = len(project_links)
        # ✅ Count only projects with descriptions
        valid_projects = [p for p in sections_text.get("PROJECTS", []) if "description" in p.lower()]
        total_projects = len(valid_projects)
        # ✅ Calculate project link score
        project_link_score = self.calculate_project_link_score(projects_with_links)
        resume_data["project_link_score"] = project_link_score
        # ✅ Prevent division by zero
        if total_projects > 0:
            if projects_with_links == 0:
                score += 0
            elif projects_with_links / total_projects >= 0.5:
                score += 1.5
                if projects_with_links == total_projects:
                    score += 3
        else:
            score += 0  # ✅ Ensure no division error if no projects exist
        """
        profile_summary = sections_text.get("PROFILE SUMMARY", "")
        print(profile_summary)
        summary_score = self.calculate_summary_score(profile_summary)
        score += summary_score
        """
        ds_skills_score = self.calculate_ds_skills_score(skills)
        score += ds_skills_score
        certifications = sections_text.get("CERTIFICATIONS & ACADEMIC ENDEAVOURS", [])
        num_certifications = len(certifications)
        if num_certifications == 0:
            score += 0
        elif 0 < num_certifications <= 2:
            score += 3
        elif 2 < num_certifications <= 4:
            score += 5
        elif num_certifications > 4:
            score += 7
        """
        extra_urls_bonus = self.calculate_extra_urls_bonus(pdf_path)
        score += extra_urls_bonus
        """
        score += relevant_experience_score
        score += project_link_score
        return score

    def imarticus_detailed_score(self, name, contact_number, email, linkedin_urls, github_url,
                                 missing_sections=None, sections_not_capitalized=None,
                                 common_projects=None, section_order_suggestion=None,
                                 sections_text=None, skills=None, relevant_experience_score=0,
                                 pdf_path=None):
        # Ensure lists and dictionaries have default values to avoid 'NoneType' errors.
        missing_sections = missing_sections or []
        sections_not_capitalized = sections_not_capitalized or []
        common_projects = common_projects or []
        sections_text = sections_text or {}
        score_breakdown = {
            "name_score": 0,
            "contact_number_score": 0,
            "email_score": 0,
            "linkedin_url_score": 0,
            "github_url_score": 0,
            "missing_sections_score": 0,
            "common_projects_score": 0,
            "section_order_score": 0,
            "projects_score": 0,
            "certifications_score": 0,
            "relevant_experience_score": 0,
            "ds_skills_score": 0,
            "extra_urls_bonus": 0,
            "summary_score": 0,
            "project_link_score": 0
        }
        # ✅ Name Score (3 Points)
        if name:
            if self.is_sentence_case(name):
                score_breakdown["name_score"] = 3
            elif self.is_present_name(name):
                score_breakdown["name_score"] = 1.5
        # ✅ Contact Number Score (3 Points)
        if contact_number and isinstance(contact_number, str):
            digits_only = re.sub(r'\D', '', contact_number)
            if digits_only.startswith("91") and len(digits_only) > 10:
                digits_only = digits_only[2:]
            if len(digits_only) == 10 and digits_only[0] in "6789":
                score_breakdown["contact_number_score"] = 3
        # ✅ Email Score (3 Points)
        score_breakdown["email_score"] = 3 if email and self.is_valid_email(email) else 0
        # ✅ LinkedIn URL Score (3 Points)
        score_breakdown["linkedin_url_score"] = 3 if linkedin_urls else 0
        # ✅ GitHub URL Score (3 Points): is_valid_url() returns a suggestion only when the URL is bad.
        if github_url and not self.is_valid_url(github_url):
            score_breakdown["github_url_score"] = 3
        # ✅ Missing Sections Score (10 Points)
        if not missing_sections and not sections_not_capitalized:
            score_breakdown["missing_sections_score"] = 10
        elif not missing_sections and sections_not_capitalized:
            score_breakdown["missing_sections_score"] = 8
        elif len(missing_sections) <= 3:
            score_breakdown["missing_sections_score"] = 6
        else:
            score_breakdown["missing_sections_score"] = 3
        # ✅ Common Projects Score (5 Points)
        score_breakdown["common_projects_score"] = 0 if common_projects else 5
        # ✅ Section Order Score (2 Points)
        score_breakdown["section_order_score"] = -2 if section_order_suggestion else 0
        # ✅ Projects Score (5 Points)
        if "PROJECTS" in sections_text:
            project_list = sections_text.get("PROJECTS", [])
            project_count = len([x for x in project_list if "Description" in x])
            if project_count <= 2:
                score_breakdown["projects_score"] = 2
            elif 2 < project_count <= 4:
                score_breakdown["projects_score"] = 5
            else:
                score_breakdown["projects_score"] = 3
        # ✅ Certifications Score (7 Points)
        certifications = sections_text.get("CERTIFICATIONS & ACADEMIC ENDEAVOURS", [])
        num_certifications = len(certifications)
        if num_certifications == 0:
            score_breakdown["certifications_score"] = 0
        elif 0 < num_certifications <= 2:
            score_breakdown["certifications_score"] = 3
        elif 2 < num_certifications <= 4:
            score_breakdown["certifications_score"] = 5
        else:
            score_breakdown["certifications_score"] = 7
        # ✅ Relevant Experience Score (5 Points)
        score_breakdown["relevant_experience_score"] = relevant_experience_score if relevant_experience_score is not None else 0
        # ✅ Data Science Skills Score (5 Points)
        score_breakdown["ds_skills_score"] = self.calculate_ds_skills_score(skills)
        # ✅ Extra URLs Bonus (5 Points): requires the PDF path, not the section text.
        if pdf_path:
            score_breakdown["extra_urls_bonus"] = self.calculate_extra_urls_bonus(pdf_path)
        # ✅ Summary Score (5 Points)
        profile_summary = sections_text.get("PROFILE SUMMARY", "")
        score_breakdown["summary_score"] = self.calculate_summary_score(profile_summary)
        # ✅ Project Link Score (2 Points)
        project_links = self.extract_project_links(sections_text)
        projects_with_links = len(project_links)
        score_breakdown["project_link_score"] = self.calculate_project_link_score(projects_with_links)
        return score_breakdown

    def calculate_name_score(self, name):
        if not name:
            return 0
        if self.is_sentence_case(name):
            return 3
        elif self.is_present_name(name):
            return 1.5
        return 0

    def calculate_contact(self, contact_number):
        if contact_number and isinstance(contact_number, str):
            digits_only = re.sub(r'\D', '', contact_number)
            if digits_only.startswith("91") and len(digits_only) > 10:
                digits_only = digits_only[2:]  # Strip the '91' country code.
            if len(digits_only) == 10 and digits_only[0] in "6789":  # Check for valid Indian mobile numbers.
                return 3
        return 0

    def calculate_email(self, email):
        if email and self.is_valid_email(email):
            return 3
        return 0

    def calculate_github_url_score(self, github_url):
        if github_url:
            github_suggestion = self.is_valid_url(github_url)
            return 3 if not github_suggestion else 0
        return 0

    def parse_text(self, path):
        logger = logging.getLogger(__name__)
        logging.getLogger("pdfminer").setLevel(logging.WARNING)
        resume_data = {}
        logger.debug('parsing text')
        text = self.extract_text_from_pdf(path)
        skills_found = self.extract_skills_from_resume(text)
        found_keywords = self.extract_keyword_variations_from_resume(text)
        sections_text = self.segregate_sections(text)
        formatted_text = self.extract_and_format_sections(sections_text, Extract_sections)
        found_keyword_section = self.extract_keyword_variations_from_formatted_text(formatted_text)
        projects = sections_text.get("PROJECTS", [])
        certifications = sections_text.get("CERTIFICATIONS & ACADEMIC ENDEAVOURS", [])
        projects_text = "\n".join(projects)
        certifications_text = "\n".join(certifications)
        found_imarticus_certification = self.check_imarticus_certifications(certifications_text)
        found_projects = self.check_common_projects(projects_text)
        name, name_suggestion = self.extract_name(text)
        contact_number, contact_suggestion = self.extract_contact_number_from_resume(text)
        email, email_suggestion = self.extract_email_from_resume(path)
        github_urls = self.extract_github_urls_from_pdf(path)
        github_urls_suggestions = self.is_valid_url(github_urls)
        linkedin_urls = self.extract_linkedIn_urls_from_pdf(path)
        section_by_grammer_issues = self.grammar_issue_check(text, found_keyword_section, Extract_sections)
        domains = [
            r"hackerrank\.com",  # HackerRank
            r"leetcode\.com",    # LeetCode
            r"medium\.com"       # Medium
        ]
        extra_urls = self.extract_extra_urls_pdf(path, domains)
        education_order_suggestion, education_suggestion = self.chronological_order_check(sections_text, "ACADEMIC PROFILE")
        experience_order_suggestion, experience_suggestion = self.chronological_order_check(sections_text, "WORK EXPERIENCE")
        headers = list(sections_text.keys())
        spelling_suggestions = self.check_spelling(headers, section_headers)
        predefined_terms = [name, email]
        predefined_terms.extend(required_sections)
        text_properties = self.extract_text_properties(path, predefined_terms)
        grouped_properties = self.group_similar_fonts(text_properties)
        different_texts = self.identify_different_fonts_and_sizes(grouped_properties)
        font_suggestions = []
        for item in different_texts:
            font_suggestions.append(
                f"Formatting issue at Page: {item['page_num']}, Text: {item['text']}, "
                f"Reason: {item['reason']}, Found font size: {item['found_size']}, "
                f"Found font name: {item['found_font_name']}")
        missing_sections, sections_not_capitalized = self.extract_sections_from_resume(text)
        linkedin_urls_suggestion = ""
        common_project = ""
        if not name:
            name_suggestion = "Please add your name to the resume."
        if not contact_number:
            contact_suggestion = "Please add a contact number to the resume."
        if not email:
            email_suggestion = "Please add an email address to the resume."
        if not github_urls:
            github_urls_suggestions = "Add a GitHub URL to the resume."
        if not linkedin_urls:
            linkedin_urls_suggestion = "Add a LinkedIn URL to the resume."
        if found_projects:
            common_project = "Common projects found in Projects section: " + ", ".join(found_projects)
        # Project length suggestion, counting only projects with a description.
        project_list = sections_text.get("PROJECTS", [])
        projects_with_description = [p for p in project_list if "description" in p.lower()]
        project_count = len(projects_with_description)
        if project_count == 0:
            project_length_suggestion = "No projects found. Consider adding at least 2 projects."
        elif project_count == 1:
            project_length_suggestion = "Only 1 project found. Consider adding 1 more project."
        else:
            project_length_suggestion = f"{project_count} projects found."
        # Store the project length suggestion in the resume data.
        resume_data["project_length_suggestion"] = project_length_suggestion
        experience_text = sections_text.get("WORK EXPERIENCE", "")  # ✅ Extract work experience section
        relevant_experience_score = self.calculate_relevant_experience_score(experience_text)  # ✅ Calculate score
        # ✅ Store in the final resume data output
        resume_data["relevant_experience_score"] = relevant_experience_score
        section_grammar_check_issues = self.grammar_check(" ".join(sections_text.keys()))
        recommended_blogs = random.sample(blog_articles, 2)
        recommended_youtube = random.sample(youtube_links, 2)
        name_score = self.calculate_name_score(name)
        contact_score = self.calculate_contact(contact_number)
        email_score = self.calculate_email(email)
        github_url_score = self.calculate_github_url_score(github_urls)
        # Calculate the Imarticus review score.
        imarticus_score = self.imarticus_review_score(
            name, contact_number, email, linkedin_urls, github_urls,
            missing_sections, sections_not_capitalized,
            common_projects=found_projects,  # Pass the found common projects.
            section_order_suggestion=experience_order_suggestion,
            sections_text=sections_text,
            skills=skills_found,
            relevant_experience_score=relevant_experience_score,
            # pdf_path=path
            # relevant_keywords_found=bool(found_keywords),
            # experience_orderly_arranged=experience_order_suggestion,
            # experience_section_present="WORK EXPERIENCE" in sections_text
        )
        # Populate the resume data dictionary (update, so earlier keys are kept).
        resume_data.update({
            "name": name,
            "contact_number": contact_number,
            "email": email,
            "linkedin_urls": linkedin_urls,
            "experience_order_suggestion": experience_order_suggestion,
            "education_order_suggestion": education_order_suggestion,
            "grammer_issues_by_section": section_by_grammer_issues,
            "github_urls": github_urls,
            "skills": skills_found,
            "spelling_suggestions": spelling_suggestions,
            "found_keywords": found_keywords,
            "text": text,
            "font_suggestions": font_suggestions,
            "name_suggestion": name_suggestion,
            "contact_suggestion": contact_suggestion,
            "email_suggestion": email_suggestion,
            "github_urls_suggestions": github_urls_suggestions,
            "linkedin_urls_suggestion": linkedin_urls_suggestion,
            "missing_sections": missing_sections,
            "common_projects": common_project,
            "project_length_suggestion": project_length_suggestion,
            "section_grammar_check_issues": section_grammar_check_issues,
            "imarticus_score": imarticus_score,  # Add the score to the resume data.
            "extra_urls": extra_urls,
            "certifications": {
                "found": found_imarticus_certification["found"],
                "message": found_imarticus_certification["message"],
                "text": certifications_text  # Store the extracted certification text.
            },
            "recommended_blogs": recommended_blogs,
            "recommended_youtube_links": recommended_youtube,
            "name_score": name_score,
            "contact_score": contact_score,
            "email_score": email_score,
            "github_urls_score": github_url_score
        })
        # Additional checks and data additions.
        section_keys = list(sections_text.keys())
        if "WORK EXPERIENCE" in section_keys and len(section_keys) > 2 and section_keys[2] != "WORK EXPERIENCE":
            resume_data["section_order_suggestion"] = f"WORK EXPERIENCE should come before {section_keys[2]}"
        missing_important_sections = self.check_missing_sections(resume_data)
        resume_data["basic_information_section"] = missing_important_sections or "Basic information is found"
        missing_skills = list(set(essential_skills) - set(skills_found))
        resume_data["missing_skills"] = missing_skills
        found_keywords_count = len(resume_data["found_keywords"])
        num_keywords = len(keyword_variations)
        quality_mapping = {"Low": 0.2, "Medium": 0.5, "High": 0.8}  # Assuming some quality mapping.
        for quality, threshold in quality_mapping.items():
            if found_keywords_count < num_keywords * threshold:
                resume_data["quality"] = quality
                break
        found_certification = ("Imarticus certification found in Certifications section."
                               if found_imarticus_certification["found"]
                               else "No Imarticus certification found in Certifications section.")
        resume_data["found_certification"] = found_certification
        # Experience relevance check.
        Extract_exp_sections = ['WORK EXPERIENCE']
        experience_text = self.extract_and_format_sections(sections_text, Extract_exp_sections)
        if experience_text:
            resume_data["work_experience_check"] = (
                "Experience is relevant to Data Science."
                if any(variation.lower() in experience_text.lower()
                       for keyword, variations in keyword_variations.items()
                       for variation in variations)
                else "Experience is not relevant to Data Science.")
        return jsonify(resume_data)
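

# --- Usage sketch (an illustrative assumption, not part of the original module) ---
# parse_text() returns flask.jsonify(...), which requires an active application
# context, so a throwaway Flask app is created here purely to drive the parser.
# "resume.pdf" is a hypothetical input path. Because this module uses relative
# imports (from .config import ...), it must be run as part of its package,
# e.g. `python -m yourpackage.resume_parser` (package name is hypothetical),
# not as a standalone script.
if __name__ == "__main__":
    from flask import Flask

    app = Flask(__name__)
    parser = ResumeParser()
    with app.app_context():
        response = parser.parse_text("resume.pdf")  # hypothetical sample resume
        data = response.get_json()
        print(data["name"], data["imarticus_score"])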