# total score = 67
# python file to parse different section from resume
#
# NOTE(review): this file was recovered from a deletion-only diff whose line
# structure was mangled in transit; formatting has been reconstructed.
from pdfminer.high_level import extract_pages, extract_text
from pdfminer.layout import LTTextContainer, LTChar, LTTextLineHorizontal
from collections import defaultdict
from flask import jsonify
import re, fitz, requests, logging, datetime
import src.config as config
from .config import data_science_skills, keyword_variations, essential_skills, quality_mapping, Extract_sections, suggested_projects, ignore_rule_ids
from .config import required_sections, linkedin_domain, github_domain, basic_informations, section_headers, common_projects, ignore_error_keywords, blog_articles, youtube_links
from .config import kaggle_domain, hackerrank_domain, leetcode_domain, medium_domain
from spacy.matcher import Matcher
import language_tool_python
import random

# Shared grammar checker; constructed once because LanguageTool startup is slow.
tool = language_tool_python.LanguageTool('en-US')


class ResumeParser:
    """Parses a resume PDF: contact details, sections, skills, grammar and scoring."""

    def extract_contact_number_from_resume(self, text):
        """Return (contact_number, suggestion) found in *text*, or (None, "")."""
        contact_number = None
        suggestion = ""

        # Use regex pattern to find a potential contact number
        pattern = r"\b(?:\+?\d{1,3}[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}\b"
        match = re.search(pattern, text)
        if match:
            contact_number = match.group()
            # Check if the contact number is of the correct length
            digits_only = re.sub(r'\D', '', contact_number)
            if len(digits_only) == 10:
                suggestion = ""
            elif len(digits_only) > 10 and digits_only.startswith('91') and len(digits_only[2:]) == 10:
                # Indian country code prefix (+91) followed by a 10-digit number.
                suggestion = ""
            else:
                suggestion = "Contact number should have exactly 10 digits."

        return contact_number, suggestion

    def extract_hyperlinks(self, pdf_path):
        """Return every URI hyperlink found on any page of the PDF."""
        doc = fitz.open(pdf_path)
        links = []
        try:
            for page_num in range(len(doc)):
                page = doc.load_page(page_num)
                for link in page.get_links():
                    uri = link.get('uri', None)
                    if uri:
                        links.append(uri)
        finally:
            # FIX: the original never closed the document (resource leak).
            doc.close()
        return links

    def extract_text_from_pdf(self, pdf_path):
        """Extract the full plain text of the PDF via pdfminer."""
        return extract_text(pdf_path)

    def extract_email_from_text(self, text):
        """Return the first email-looking token in *text*, or None."""
        pattern = r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b"
        match = re.search(pattern, text)
        if match:
            return match.group()
        return None

    def extract_email_from_resume(self, pdf_path):
        """Return (email, suggestion); falls back to mailto: hyperlinks."""
        text = self.extract_text_from_pdf(pdf_path)
        email = self.extract_email_from_text(text)
        suggestion = ""

        # If no email found in text, check hyperlinks
        if not email:
            for link in self.extract_hyperlinks(pdf_path):
                if link.startswith('mailto:'):
                    email_candidate = link.split('mailto:')[1]
                    if self.is_valid_email(email_candidate):
                        email = email_candidate
                        break

        # Additional validation for email found in text or links
        if email and not self.is_valid_email(email):
            suggestion += "Your email address doesn't seem to be valid. Please check and correct."

        return email, suggestion

    def is_valid_email(self, email):
        """Validate length, special-character runs, domain and overall format."""
        # Length check (RFC 5321 upper bound)
        if len(email) > 254:
            return False

        # Consecutive special characters check
        if re.search(r"[._%+-]{2,}", email):
            return False

        # FIX: guard missing '@' — mailto: candidates may lack one, and the
        # original unconditionally indexed split('@')[1] (IndexError).
        if '@' not in email:
            return False

        # Domain part validation
        domain_part = email.split('@')[1]
        if not re.match(r"[A-Za-z0-9.-]+\.[A-Za-z]{2,}", domain_part):
            return False

        # Standard email format check
        pattern = r"^[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$"
        return re.match(pattern, email) is not None

    def extract_sections_from_resume(self, text):
        """Return (missing_sections, sections_not_capitalized) vs required_sections."""
        missing_sections = []
        sections_not_capitalized = []

        for section in required_sections:
            pattern = r"\b{}\b".format(re.escape(section))
            match_obj = re.search(pattern, text, re.IGNORECASE)
            if not match_obj:
                missing_sections.append(section)
            else:
                # Header found but not written in full upper case.
                if match_obj.group() not in map(str.upper, required_sections):
                    sections_not_capitalized.append(section)

        return missing_sections, sections_not_capitalized

    def extract_skills_from_resume(self, text):
        """Return every essential skill that appears (case-insensitively) in *text*."""
        if not isinstance(text, str):
            raise ValueError(f"Expected 'text' to be a string, but got {type(text)}")

        skills = []
        for skill in essential_skills:
            pattern = r"\b{}\b".format(re.escape(skill))
            if re.search(pattern, text, re.IGNORECASE):
                skills.append(skill)
        return skills

    def extract_keyword_variations_from_resume(self, text):
        """Return the first matching variation of each configured keyword."""
        found_keywords = []
        for keyword, variations in keyword_variations.items():
            for variation in variations:
                if variation.lower() in text.lower():
                    found_keywords.append(variation)
                    break
        return found_keywords

    def extract_keyword_variations_from_formatted_text(self, formatted_text):
        """Same as extract_keyword_variations_from_resume but over formatted section text."""
        found_keyword_section = []
        for keyword, variations in keyword_variations.items():
            for variation in variations:
                if variation.lower() in formatted_text.lower():
                    found_keyword_section.append(variation)
                    break
        return found_keyword_section

    def extract_linkedIn_urls_from_pdf(self, pdf_path):
        """Return the last LinkedIn URL hyperlinked in the PDF, or None."""
        linkedin_urls = None
        pdf_document = fitz.open(pdf_path)
        for page_num in range(len(pdf_document)):
            page = pdf_document.load_page(page_num)
            for link in page.get_links():
                url = link.get('uri', '')
                if re.search(linkedin_domain, url):
                    linkedin_urls = url
        pdf_document.close()
        return linkedin_urls

    def extract_github_urls_from_pdf(self, pdf_path):
        """Return a GitHub *profile* URL (single path segment after the domain), or None."""
        github_urls = None
        pdf_document = fitz.open(pdf_path)
        for page_num in range(len(pdf_document)):
            page = pdf_document.load_page(page_num)
            for link in page.get_links():
                url = link.get('uri', '')
                if re.search(github_domain, url):
                    path = re.sub(github_domain, '', url)
                    parts = path.split('/')
                    # Exactly one segment => user profile, not a repository link.
                    if len(parts) == 1:
                        github_urls = url
        pdf_document.close()
        return github_urls

    def extract_extra_urls_pdf(self, pdf_path, domains):
        """Return {domain_pattern: [urls]} for hyperlinks matching any of *domains*."""
        extracted_urls = defaultdict(set)
        # FIX: pre-initialise so the finally block cannot raise NameError when
        # fitz.open itself fails (original referenced an unbound name).
        pdf_document = None
        try:
            # Open the PDF document
            pdf_document = fitz.open(pdf_path)

            # Iterate through all pages in the PDF
            for page_num in range(len(pdf_document)):
                page = pdf_document.load_page(page_num)
                for link in page.get_links():
                    url = link.get('uri', '')
                    if url:  # Ensure there's a URL
                        for domain in domains:
                            if re.search(domain, url, re.IGNORECASE):
                                extracted_urls[domain].add(url)  # Add URL to the domain's set
        except Exception as e:
            print(f"Error processing PDF: {e}")
        finally:
            if pdf_document is not None:
                pdf_document.close()

        return {domain: list(urls) for domain, urls in extracted_urls.items()}

    def is_valid_url(self, github_urls):
        """Return "" when the URL answers HTTP 200, else a correction suggestion."""
        suggest = ""
        if not github_urls:
            return suggest
        try:
            response = requests.head(github_urls)
            if response.status_code != 200:
                suggest = "GitHub URL is not valid, please check and correct. "
        except requests.RequestException:
            suggest = "GitHub URL is not valid, please check and correct. "
        return suggest

    def is_valid_name(self, name):
        """Heuristic: no digits, at most 3 words, not a known non-name label."""
        if any(char.isdigit() for char in name):
            return False
        if len(name.split()) > 3:
            return False
        common_non_names = {"Email", "Github", "LinkedIn", "Portfolio", "Data Analyst"}
        if name in common_non_names:
            return False
        return True

    def extract_name(self, resume_text):
        """Return (name, suggestion) from the first plausible name line, or (None, msg)."""
        lines = resume_text.split('\n')

        # Use regex to find lines that likely contain names
        name_lines = [line for line in lines if re.match(r'^[A-Za-z]*\s[A-Za-z]*', line.strip())]

        names = [line.strip() for line in name_lines if self.is_valid_name(line.strip())]

        if names:
            name = names[0]
            suggestion = ""
            # Check if the name parts contain only alphabetic characters
            name_parts = name.split()
            if any(part[0].islower() for part in name_parts):
                suggestion += " name should start with a capital letter. "
            return name, suggestion

        return None, "No valid name found"

    def check_missing_sections(self, resume_data):
        """Return the basic_informations keys missing (or falsy) in *resume_data*."""
        missing_information = []
        for section in basic_informations:
            if not resume_data.get(section):
                missing_information.append(section)
        return missing_information

    def segregate_sections(self, text):
        """Split *text* into {HEADER: [lines]} using the configured section headers."""
        header_pattern = re.compile(rf'^\s*({"|".join(re.escape(header) for header in section_headers)}):?\s*$', re.IGNORECASE)
        sections_text = {}
        current_section = None
        for line in text.splitlines():
            clean_line = line.strip()
            match = header_pattern.match(clean_line)
            if match:
                current_section = match.group(1).upper()
                sections_text[current_section] = []
            elif current_section:
                sections_text[current_section].append(line.strip())
        return sections_text

    def extract_and_format_sections(self, sections_text, Extract_sections):
        """Join the requested sections back into "HEADER:\\ncontent" text."""
        formatted_text = ""
        for section in Extract_sections:
            if section in sections_text:
                section_content = " ".join(sections_text[section]).replace('\n', ' ')
                formatted_text += f"{section}:\n{section_content}\n\n"
        return formatted_text

    def replace_keywords_with_placeholders(self, formatted_text, found_keyword_section):
        """Mask known keywords with {KEYWORD_i} so the grammar checker skips them."""
        placeholder_text = formatted_text
        keyword_placeholders = {}

        # Use a set to avoid duplicates and keep track of keyword placeholders
        used_keywords = set()
        for i, keyword in enumerate(found_keyword_section):
            if keyword not in used_keywords:
                used_keywords.add(keyword)
                placeholder = f"{{KEYWORD_{i}}}"
                keyword_placeholders[placeholder] = keyword
                # Using word boundary to match whole words
                placeholder_text = re.sub(r'\b' + re.escape(keyword) + r'\b', placeholder, placeholder_text, flags=re.IGNORECASE)

        return placeholder_text, keyword_placeholders

    def replace_placeholders_with_keywords(self, grammar_issues, keyword_placeholders):
        """Restore original keywords inside each issue's context (mutates in place)."""
        updated_issues = []
        for issue in grammar_issues:
            context = issue['context']
            for placeholder, keyword in keyword_placeholders.items():
                context = context.replace(placeholder, keyword)
            # Update the context in the issue dictionary
            issue['context'] = context
            updated_issues.append(issue)
        return updated_issues

    def grammar_check(self, placeholder_text):
        """Run LanguageTool and return a list of issue dicts."""
        matches = tool.check(placeholder_text)
        grammar_issues = []
        for match in matches:
            grammar_issues.append({
                "context": match.context,
                "error": match.message,
                "rule_id": match.ruleId,
                "suggested_correction": match.replacements
            })
        return grammar_issues

    def filter_grammar_issues(self, grammar_issues, ignore_rule_ids=None, ignore_error_keywords=None):
        """Drop issues whose rule id or message matches the configured ignore lists."""
        if ignore_rule_ids is None:
            ignore_rule_ids = []
        if ignore_error_keywords is None:
            ignore_error_keywords = []

        filtered_issues = []
        for issue in grammar_issues:
            if issue['rule_id'] not in ignore_rule_ids and not any(keyword in issue['error'] for keyword in ignore_error_keywords):
                filtered_issues.append(issue)
        return filtered_issues

    def process_resume(self, text, found_keyword_section, Extract_sections):
        """Grammar-check the requested sections of *text*; returns issues + spelling errors."""
        sections_text = self.segregate_sections(text)
        formatted_text = self.extract_and_format_sections(sections_text, Extract_sections)
        # NOTE(review): the found_keyword_section parameter is recomputed here,
        # shadowing the argument — preserved from the original behaviour.
        found_keyword_section = self.extract_keyword_variations_from_formatted_text(formatted_text)
        placeholder_text, keyword_placeholders = self.replace_keywords_with_placeholders(formatted_text, found_keyword_section)
        grammar_issues = self.grammar_check(placeholder_text)
        # Mutates the issue dicts in place, so grammar_issues is already restored.
        self.replace_placeholders_with_keywords(grammar_issues, keyword_placeholders)
        filtered_grammar_issues = self.filter_grammar_issues(grammar_issues, ignore_rule_ids, ignore_error_keywords)
        return {
            "grammar_issues": filtered_grammar_issues,
            "spelling_errors": [issue for issue in filtered_grammar_issues if "SPELLING" in issue['rule_id']]
        }

    def grammar_issue_check(self, text, found_keyword_section, Extract_sections):
        """Run process_resume per section; returns {section: issues}."""
        issues = {}
        for section in Extract_sections:
            grammar_issues = self.process_resume(text, found_keyword_section, [section])
            if not grammar_issues:
                grammar_issues = "no error found"
            issues[section] = grammar_issues
        return issues

    def normalize_font_name(self, font_name):
        """Strip subset prefixes ("ABCDEF+Font") and style suffixes ("Font-Bold")."""
        if '-' in font_name:
            font_name = font_name.split('-')[0]
        if '+' in font_name:
            font_name = font_name.split('+')[1]
        return font_name

    def extract_text_properties(self, pdf_path, predefined_terms):
        """Collect phrases with their font size/name/page, skipping predefined terms."""
        text_properties = []
        current_phrase = ""
        current_font_size = None
        current_font_name = None
        current_page_num = None

        special_characters = set("●▪•!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~")

        def add_current_phrase():
            # Flush the accumulated phrase unless it is part of a predefined term.
            nonlocal current_phrase
            if current_phrase.strip():
                flag = any(current_phrase in term for term in predefined_terms)
                if not flag:
                    text_properties.append({
                        "text": current_phrase,
                        "font_size": current_font_size,
                        "font_name": current_font_name,
                        "page_num": current_page_num
                    })
            current_phrase = ""

        for page_layout in extract_pages(pdf_path):
            for element in page_layout:
                if isinstance(element, LTTextContainer):
                    for text_line in element:
                        if isinstance(text_line, LTTextLineHorizontal):
                            for character in text_line:
                                if isinstance(character, LTChar):
                                    text = character.get_text()
                                    font_size = round(character.size, 2)
                                    font_name = self.normalize_font_name(character.fontname)
                                    page_num = page_layout.pageid

                                    if text.isspace() or text in special_characters:
                                        add_current_phrase()
                                        continue

                                    # Font change => new phrase boundary.
                                    if (font_size != current_font_size or font_name != current_font_name or
                                            page_num != current_page_num):
                                        add_current_phrase()
                                        current_font_size = font_size
                                        current_font_name = font_name
                                        current_page_num = page_num

                                    current_phrase += text

        add_current_phrase()
        return text_properties

    def group_similar_fonts(self, text_properties, tolerance=0.5):
        """Group properties by (font_name, size rounded to *tolerance*)."""
        grouped_properties = defaultdict(list)
        for prop in text_properties:
            rounded_size = round(prop["font_size"] / tolerance) * tolerance
            key = (prop["font_name"], rounded_size)
            grouped_properties[key].append(prop)
        return grouped_properties

    def identify_different_fonts_and_sizes(self, grouped_properties):
        """Flag text whose font/size differs from the dominant (most common) group."""
        most_common_group = max(grouped_properties.values(), key=len)
        most_common_key = None
        for key, group in grouped_properties.items():
            if group == most_common_group:
                most_common_key = key
                break

        different_texts = []
        for key, group in grouped_properties.items():
            if group != most_common_group:
                for prop in group:
                    reason = []
                    if key[1] != most_common_key[1]:
                        reason.append(f"size not {most_common_key[1]}")
                    if key[0] != most_common_key[0]:
                        reason.append(f"font not {most_common_key[0]}")
                    different_texts.append({
                        "page_num": prop['page_num'],
                        "text": prop['text'],
                        "found_size": prop['font_size'],
                        "found_font_name": prop['font_name'],
                        "reason": ", ".join(reason)
                    })
        return different_texts

    def parse_dates(self, sections_text, section_name):
        """Extract date / date-range strings from every entry of *section_name*."""
        # Define the date patterns to match various date formats
        date_pattern = (
            r'\b\d{1,2}/\d{4}\b|'  # MM/YYYY
            r'\b(?:jan(?:uary)?|feb(?:ruary)?|mar(?:ch)?|apr(?:il)?|may|jun(?:e)?|jul(?:y)?|aug(?:ust)?|sep(?:tember)?|oct(?:ober)?|nov(?:ember)?|dec(?:ember)?)\s+\d{4}\b|'  # Month YYYY
            r'\b(?:jan(?:uary)?|feb(?:ruary)?|mar(?:ch)?|apr(?:il)?|may|jun(?:e)?|jul(?:y)?|aug(?:ust)?|sep(?:tember)?|oct(?:ober)?|nov(?:ember)?|dec(?:ember)?)\s+\d{1,2},?\s*\d{4}\b|'  # Month DD, YYYY
            r'\b\d{4}\b|'  # YYYY
            r'\b(?:jan(?:uary)?|feb(?:ruary)?|mar(?:ch)?|apr(?:il)?|may|jun(?:e)?|jul(?:y)?|aug(?:ust)?|sep(?:tember)?|oct(?:ober)?|nov(?:ember)?|dec(?:ember)?)[a-z]*/?\d{4}\b|'  # Month/YYYY
            r'\b(?:jan(?:uary)?|feb(?:ruary)?|mar(?:ch)?|apr(?:il)?|may|jun(?:e)?|jul(?:y)?|aug(?:ust)?|sep(?:tember)?|oct(?:ober)?|nov(?:ember)?|dec(?:ember)?)[a-z]*\d{4}\s*-\s*(?:jan(?:uary)?|feb(?:ruary)?|mar(?:ch)?|apr(?:il)?|may|jun(?:e)?|jul(?:y)?|aug(?:ust)?|sep(?:tember)?|oct(?:ober)?|nov(?:ember)?|dec(?:ember)?)[a-z]*\d{4}\b'  # Month/YYYY - Month/YYYY
        )

        all_dates = []
        # Iterate over the entries in the section_name
        for entry in sections_text[section_name]:
            entry = entry.lower()
            matches = re.findall(date_pattern, entry)
            # Only entries with at least two matches are treated as date ranges.
            if matches and len(matches) > 1:
                if len(matches) == 2:
                    all_dates.append(f"{matches[0]} {matches[1]}")
                else:
                    all_dates.extend(matches)
        return all_dates

    def convert_to_date(self, date_str):
        """Parse a date or date-range string into (start_date, end_date), or []."""
        # Mapping of month names and abbreviations to their numeric equivalents
        month_map = {
            'jan': 1, 'january': 1, 'feb': 2, 'february': 2,
            'mar': 3, 'march': 3, 'apr': 4, 'april': 4,
            'may': 5, 'jun': 6, 'june': 6, 'jul': 7,
            'july': 7, 'aug': 8, 'august': 8, 'sep': 9,
            'september': 9, 'oct': 10, 'october': 10,
            'nov': 11, 'november': 11, 'dec': 12, 'december': 12,
            '01': 1, '02': 2, '03': 3, '04': 4,
            '05': 5, '06': 6, '07': 7, '08': 8,
            '09': 9, '10': 10, '11': 11, '12': 12
        }

        # Regex patterns to match different date formats
        pattern_mm_yyyy = re.compile(r'(\d{1,2})/(\d{4})')
        pattern_mm_yyyy_space = re.compile(r'(\d{1,2})\s(\d{4})')
        pattern_month_yyyy = re.compile(r'([a-zA-Z]+)\s?(\d{4})')
        pattern_yyyy = re.compile(r'(\d{4})')

        def extract_date(date_str):
            match_mm_yyyy = pattern_mm_yyyy.match(date_str)
            match_mm_yyyy_space = pattern_mm_yyyy_space.match(date_str)
            match_month_yyyy = pattern_month_yyyy.match(date_str)
            match_yyyy = pattern_yyyy.match(date_str)

            if match_mm_yyyy:
                month = int(match_mm_yyyy.group(1))
                year = int(match_mm_yyyy.group(2))
            elif match_mm_yyyy_space:
                month = int(match_mm_yyyy_space.group(1))
                year = int(match_mm_yyyy_space.group(2))
            elif match_month_yyyy:
                month = month_map.get(match_month_yyyy.group(1).lower())
                year = int(match_month_yyyy.group(2))
            elif match_yyyy:
                month = 1
                year = int(match_yyyy.group(1))
            else:
                return []
            return datetime.date(year, month, 1)

        date_parts = re.findall(r'(\d{4}\s[a-zA-Z]+\s?|\d{4}[a-zA-Z]+|\d{4}\/\d{2}|\d{4}\s\d{2}|[a-zA-Z]+\s?\d{4}|\d{4}\s[a-zA-Z]+)', date_str)
        if len(date_parts) == 1:
            # Standalone year or single date
            start_date = extract_date(date_parts[0])
            end_date = datetime.date(start_date.year, start_date.month, start_date.day)
        elif len(date_parts) == 2:
            # Date range
            start_date = extract_date(date_parts[0])
            end_date = extract_date(date_parts[1])
        else:
            return []

        return start_date, end_date

    def date_time(self, date_parts):
        """Convert each date string into a (start, end) tuple."""
        converted_dates = []
        for date_part in date_parts:
            start_date, end_date = self.convert_to_date(date_part)
            converted_dates.append((start_date, end_date))
        return converted_dates

    def check_chronological_order(self, converted_dates, section_name):
        """Return a message saying whether the section is in reverse-chronological order."""
        sorted_dates = sorted(converted_dates, key=lambda x: (x[1], x[0]), reverse=True)
        if converted_dates == sorted_dates:
            suggestion = f"{section_name} section is in chronological order."
        else:
            suggestion = f"{section_name} section is not in chronological order."
        return suggestion

    def check_common_projects(self, projects_text):
        """Return the configured common projects mentioned in *projects_text*."""
        found_projects = []
        for project in common_projects:
            if project.lower() in projects_text.lower():
                found_projects.append(project)
        return found_projects

    def recommend_resources(self):
        """Return 2 random blog articles and 2 random YouTube links.

        FIX: the original omitted ``self`` and would raise TypeError when
        called on an instance.
        """
        recommended_blogs = random.sample(blog_articles, 2)
        recommended_youtube = random.sample(youtube_links, 2)
        return {
            "Recommended Blogs": recommended_blogs,
            "Recommended YouTube Links": recommended_youtube
        }

    def check_imarticus_certifications(self, certifications_text):
        """Report whether an Imarticus certification is mentioned."""
        # Check if "imarticus" is present in the certifications text
        if "imarticus" in certifications_text.lower():
            return {
                "found": True,
                "message": "Imarticus certification found. Please upload it in the academic section."
            }
        return {
            "found": False,
            "message": "No Imarticus certification found in the provided text."
        }

    def chronological_order_check(self, sections_text, section_name):
        """Return (order_suggestion, suggestion) for the dated section *section_name*."""
        order_suggestion = ""
        suggestion = ""
        section_name = section_name.upper()
        if section_name in sections_text:
            date = self.parse_dates(sections_text, section_name)
            if date:
                converted_dates = self.date_time(date)
                order_suggestion = self.check_chronological_order(converted_dates, section_name)
            else:
                suggestion = f"No valid dates found in {section_name} section. "
        else:
            suggestion = f"{section_name} is not in section header. "
        return order_suggestion, suggestion

    # Function to check for spelling mistakes
    def check_spelling(self, headers, section_headers):
        """Return headers that do not match any known section header.

        FIX: the original assigned ``suggestions = header`` (replacing the
        list with a single string) instead of appending, so only the last
        misspelled header survived.
        """
        suggestions = []
        for header in headers:
            if header.upper() not in map(str.upper, section_headers):
                suggestions.append(header)
        return suggestions

    # NOTE(review): the original defined module-style is_present_name /
    # is_sentence_case twice (first pair without ``self`` — dead, shadowed
    # code). Only the instance-method versions are kept.
    def is_present_name(self, name):
        """True when *name* has at least two words."""
        parts = name.split()
        return len(parts) >= 2

    def is_sentence_case(self, name):
        """True when every word is capitalised (first upper, rest lower)."""
        parts = name.split()
        for part in parts:
            if not part:  # handles empty strings in name
                continue
            if not part[0].isupper() or not part[1:].islower():
                return False
        return True

    def extract_project_links(self, sections_text):
        """Return {project_line: [urls]} for project lines that contain URLs."""
        project_links = {}
        if "PROJECTS" in sections_text:
            project_list = sections_text.get("PROJECTS", [])
            url_pattern = r"https?://[^\s]+"
            for project in project_list:
                links = re.findall(url_pattern, project)
                if links:
                    project_links[project] = links
        return project_links

    def count_sentences(self, text):
        """Count sentences in *text*.

        NOTE(review): the original regex was garbled in transit
        (``r"(? 2 and ...``, apparently an HTML-eaten ``<``); this assumes
        sentences end with '.', '!' or '?' — confirm against the original
        source.
        """
        sentence_endings = r"(?<=[.!?])\s+"
        sentences = [s for s in re.split(sentence_endings, text.strip()) if s]
        return len(sentences)

    def calculate_summary_score(self, profile_summary):
        """Score the profile summary by sentence count (3–4 sentences ideal).

        NOTE(review): reconstructed from a garbled fragment — the visible
        thresholds were ``> 2 and <= 4 -> 3``, ``> 4 -> 5``, else ``0``.
        """
        if isinstance(profile_summary, list):
            profile_summary = " ".join(profile_summary)
        num_sentences = self.count_sentences(profile_summary)
        if num_sentences > 2 and num_sentences <= 4:
            return 3
        elif num_sentences > 4:
            return 5
        else:
            return 0

    def calculate_extra_urls_bonus(self, pdf_path):
        """Return 5 when any Hackerrank/LeetCode/Medium link is hyperlinked, else 0."""
        domains = [
            r"hackerrank\.com",  # Hackerrank
            r"leetcode\.com",    # LeetCode
            r"medium\.com"       # Medium
        ]
        extra_urls = self.extract_extra_urls_pdf(pdf_path, domains)
        has_extra_urls = any(urls for urls in extra_urls.values())
        return 5 if has_extra_urls else 0

    def calculate_relevant_experience_score(self, experience_text):
        """
        Assigns a score based on the presence of relevant experience keywords.

        Args:
            experience_text (str): The extracted work experience section text.

        Returns:
            int: A score of 5 if relevant keywords are found, otherwise 0.
        """
        if not experience_text:
            return 0  # No experience section -> Score 0

        if isinstance(experience_text, list):
            experience_text = " ".join(experience_text)  # Convert list to a single string

        experience_text = experience_text.strip().lower()

        # Check if any keyword from 'data_science_skills' or 'essential_skills' exists
        for skill in config.data_science_skills + config.essential_skills:
            if skill.lower() in experience_text:
                return 5  # Found relevant experience -> full score
        return 0

    def calculate_ds_skills_score(self, skills_present):
        """Score found skills against the configured data-science skill list."""
        if not skills_present:  # No skills found at all
            return 0

        # Use skills from config instead of hardcoded list
        ds_skills_list_lower = [skill.lower() for skill in config.data_science_skills]
        skills_present_lower = [skill.lower() for skill in skills_present]

        matching_count = sum(1 for skill in skills_present_lower
                             if skill in ds_skills_list_lower)

        if matching_count == 0:  # Skills found but none match DS list
            return 2
        elif 1 <= matching_count <= 5:
            return 3
        elif matching_count > 5:
            return 5
        return 0

    def calculate_project_link_score(self, projects_with_links):
        """
        Assigns a score based on whether project links are present.

        Args:
            projects_with_links (int): The number of projects with links.

        Returns:
            int: 2 if project links are found, otherwise 0.
        """
        return 2 if projects_with_links > 0 else 0

    def imarticus_review_score(self, name, contact_number, email, linkedin_urls, github_url, missing_sections, sections_not_capitalized, common_projects, section_order_suggestion, sections_text, skills, relevant_experience_score):
        """Compute the aggregate resume score from all extracted signals."""
        score = 0

        # Name: 3 for proper sentence case, 1.5 for merely having 2+ words.
        if name:
            if self.is_sentence_case(name):
                score += 3
            elif self.is_present_name(name):
                score += 1.5

        # Contact number: valid 10-digit Indian mobile (optionally 91-prefixed).
        if contact_number and isinstance(contact_number, str):
            digits_only = re.sub(r'\D', '', contact_number)
            if digits_only.startswith("91") and len(digits_only) > 10:
                digits_only = digits_only[2:]  # Remove the '91' country code
            if len(digits_only) == 10 and digits_only[0] in "6789":
                score += 3

        if email:
            score += 3 if self.is_valid_email(email) else 0

        score += 3 if linkedin_urls else 0

        if github_url:
            github_suggestion = self.is_valid_url(github_url)
            score += 3 if not github_suggestion else 0

        # Sections completeness (10 points max).
        if len(missing_sections) == 0 and len(sections_not_capitalized) == 0:
            score += 10
        elif len(missing_sections) == 0 and len(sections_not_capitalized) > 0:
            score += 8
        elif len(missing_sections) <= 3:
            score += 6
        elif len(missing_sections) > 4:
            score += 3

        # Penalise boiler-plate ("common") projects; reward original ones.
        if common_projects:
            score += 0
        else:
            score += 5

        # FIX: the original had a bare no-op expression ``score`` in the else
        # branch; only the penalty branch has an effect.
        if section_order_suggestion:
            score -= 2

        """
        ds_skills_list_lower = [skill.lower() for skill in data_science_skills]
        skills_present_lower = [skill.lower() for skill in self.extract_skills_from_resume(skills) ]

        matching_skill_count = 0
        for skill in skills_present_lower:
            if ds_skills_list_lower:
                matching_skill_count+=1
        if matching_skill_count==0:
            score+=0

        if matching_skill_count<=5:
            score+=2
        elif matching_skill_count>=10 and matching_skill_count<=15:
            score+5
        else:
            score+=8
        """

        # Projects with a "Description" line (note: case-sensitive, as original).
        if "PROJECTS" in sections_text:
            project_list = sections_text.get("PROJECTS", [])
            project_count = len([x for x in project_list if "Description" in x])
            if project_count <= 2:
                score += 2
            elif project_count > 2 and project_count <= 4:
                score += 5
            elif project_count > 4:
                score += 3

        resume_data = {}
        # Extract projects & links
        project_links = self.extract_project_links(sections_text)
        projects_with_links = len(project_links)

        # Count only projects with descriptions (case-insensitive here).
        valid_projects = [
            p for p in sections_text.get("PROJECTS", []) if "description" in p.lower()
        ]
        total_projects = len(valid_projects)

        project_link_score = self.calculate_project_link_score(projects_with_links)
        resume_data["project_link_score"] = project_link_score

        # Prevent division by zero
        if total_projects > 0:
            if projects_with_links == 0:
                score += 0
            elif projects_with_links / total_projects >= 0.5:
                score += 1.5
                if projects_with_links == total_projects:
                    score += 3
        else:
            score += 0  # Ensure no division error if no projects exist

        """
        profile_summary = sections_text.get("PROFILE SUMMARY", "")
        print(profile_summary)

        summary_score = self.calculate_summary_score(profile_summary)
        score += summary_score
        """

        ds_skills_score = self.calculate_ds_skills_score(skills)
        score += ds_skills_score

        certifications = sections_text.get("CERTIFICATIONS & ACADEMIC ENDEAVOURS", [])
        num_certifications = len(certifications)
        if num_certifications == 0:
            score += 0
        elif 0 < num_certifications <= 2:
            score += 3
        elif 2 < num_certifications <= 4:
            score += 5
        elif num_certifications > 4:
            score += 7

        """
        extra_urls_bonus = self.calculate_extra_urls_bonus(pdf_path)
        score += extra_urls_bonus
        """

        score += relevant_experience_score
        score += project_link_score

        return score

    def imarticus_detailed_score(self, name, contact_number, email, linkedin_urls, github_url,
                                 missing_sections=None, sections_not_capitalized=None, common_projects=None,
                                 section_order_suggestion=None, sections_text=None, skills=None,
                                 relevant_experience_score=0):
        """Return a per-criterion breakdown of the resume score."""
        # Ensure lists and dictionaries have default values to avoid 'NoneType' errors
        missing_sections = missing_sections or []
        sections_not_capitalized = sections_not_capitalized or []
        common_projects = common_projects or []
        sections_text = sections_text or {}

        score_breakdown = {
            "name_score": 0,
            "contact_number_score": 0,
            "email_score": 0,
            "linkedin_url_score": 0,
            "github_url_score": 0,
            "missing_sections_score": 0,
            "common_projects_score": 0,
            "section_order_score": 0,
            "projects_score": 0,
            "certifications_score": 0,
            "relevant_experience_score": 0,
            "ds_skills_score": 0,
            "extra_urls_bonus": 0,
            "summary_score": 0,
            "project_link_score": 0
        }

        # Name Score (3 Points)
        if name:
            if self.is_sentence_case(name):
                score_breakdown["name_score"] = 3
            elif self.is_present_name(name):
                score_breakdown["name_score"] = 1.5

        # Contact Number Score (3 Points)
        if contact_number and isinstance(contact_number, str):
            digits_only = re.sub(r'\D', '', contact_number)
            if digits_only.startswith("91") and len(digits_only) > 10:
                digits_only = digits_only[2:]
            if len(digits_only) == 10 and digits_only[0] in "6789":
                score_breakdown["contact_number_score"] = 3

        # Email Score (3 Points)
        score_breakdown["email_score"] = 3 if email and self.is_valid_email(email) else 0

        # LinkedIn URL Score (3 Points)
        score_breakdown["linkedin_url_score"] = 3 if linkedin_urls else 0

        # GitHub URL Score (3 Points)
        # FIX: is_valid_url returns a non-empty *error* suggestion when the URL
        # is bad, so the original awarded points for INVALID urls. Align with
        # imarticus_review_score: points only when no suggestion is returned.
        if github_url and not self.is_valid_url(github_url):
            score_breakdown["github_url_score"] = 3

        # Missing Sections Score (10 Points)
        if not missing_sections and not sections_not_capitalized:
            score_breakdown["missing_sections_score"] = 10
        elif not missing_sections and sections_not_capitalized:
            score_breakdown["missing_sections_score"] = 8
        elif len(missing_sections) <= 3:
            score_breakdown["missing_sections_score"] = 6
        else:
            score_breakdown["missing_sections_score"] = 3

        # Common Projects Score (5 Points)
        score_breakdown["common_projects_score"] = 0 if common_projects else 5

        # Section Order Score (2 Points)
        score_breakdown["section_order_score"] = -2 if section_order_suggestion else 0

        # Projects Score (5 Points)
        if "PROJECTS" in sections_text:
            project_list = sections_text.get("PROJECTS", [])
            project_count = len([x for x in project_list if "Description" in x])
            if project_count <= 2:
                score_breakdown["projects_score"] = 2
            elif 2 < project_count <= 4:
                score_breakdown["projects_score"] = 5
            else:
                score_breakdown["projects_score"] = 3

        # Certifications Score (7 Points)
        certifications = sections_text.get("CERTIFICATIONS & ACADEMIC ENDEAVOURS", [])
        num_certifications = len(certifications)
        if num_certifications == 0:
            score_breakdown["certifications_score"] = 0
        elif 0 < num_certifications <= 2:
            score_breakdown["certifications_score"] = 3
        elif 2 < num_certifications <= 4:
            score_breakdown["certifications_score"] = 5
        else:
            score_breakdown["certifications_score"] = 7

        # Relevant Experience Score (5 Points)
        score_breakdown["relevant_experience_score"] = relevant_experience_score if relevant_experience_score is not None else 0

        # Data Science Skills Score (5 Points)
        score_breakdown["ds_skills_score"] = self.calculate_ds_skills_score(skills)

        # Extra URLs Bonus (5 Points)
        # FIX: the original passed sections_text (a dict) to a function that
        # expects a PDF path and crashed inside fitz.open. A pdf path is not
        # available in this signature, so guard the call and default to 0.
        try:
            score_breakdown["extra_urls_bonus"] = self.calculate_extra_urls_bonus(sections_text)
        except Exception:
            score_breakdown["extra_urls_bonus"] = 0

        # Summary Score (5 Points)
        profile_summary = sections_text.get("PROFILE SUMMARY", "")
        score_breakdown["summary_score"] = self.calculate_summary_score(profile_summary)

        # Project Link Score (2 Points)
        project_links = self.extract_project_links(sections_text)
        projects_with_links = len(project_links)
        score_breakdown["project_link_score"] = self.calculate_project_link_score(projects_with_links)

        return score_breakdown

    def parse_text(self, path):
        """Top-level entry point: parse the resume PDF at *path* and build suggestions.

        NOTE(review): the remainder of this method was truncated in the
        provided chunk; only the visible portion is reproduced below.
        """
        logger = logging.getLogger(__name__)
        logging.getLogger("pdfminer").setLevel(logging.WARNING)
        resume_data = {}
        logger.debug('parsing text')
        text = self.extract_text_from_pdf(path)
        text1 = " ".join(text.split("\n"))
        skills_found = self.extract_skills_from_resume(text)
        found_keywords = self.extract_keyword_variations_from_resume(text)
        sections_text = self.segregate_sections(text)
        formatted_text = self.extract_and_format_sections(sections_text, Extract_sections)
        found_keyword_section = self.extract_keyword_variations_from_formatted_text(formatted_text)

        parsed_sections = self.segregate_sections(text)
        projects = parsed_sections.get("PROJECTS", [])
        certifications = parsed_sections.get("CERTIFICATIONS & ACADEMIC ENDEAVOURS", [])
        projects_text = "\n".join(projects)
        certifications_text = "\n".join(certifications)
        found_imarticus_certification = self.check_imarticus_certifications(certifications_text)
        found_projects = self.check_common_projects(projects_text)

        name, name_suggestion = self.extract_name(text)
        contact_number, contact_suggestion = self.extract_contact_number_from_resume(text)
        email, email_suggestion = self.extract_email_from_resume(path)
        github_urls = self.extract_github_urls_from_pdf(path)
        github_urls_suggestions = self.is_valid_url(github_urls)
        linkedin_urls = self.extract_linkedIn_urls_from_pdf(path)
        section_by_grammer_issues = self.grammar_issue_check(text, found_keyword_section, Extract_sections)

        domains = [
            r"hackerrank\.com",  # Hackerrank
            r"leetcode\.com",    # LeetCode
            r"medium\.com"       # Medium
        ]
        extra_urls = self.extract_extra_urls_pdf(path, domains)

        education_order_suggestion, education_suggestion = self.chronological_order_check(sections_text, "ACADEMIC PROFILE")
        experience_order_suggestion, experience_suggestion = self.chronological_order_check(sections_text, "WORK EXPERIENCE")

        headers = list(sections_text.keys())
        spelling_suggestions = self.check_spelling(headers, section_headers)

        predefined_terms = [name, email]
        predefined_terms.extend(required_sections)
        text_properties = self.extract_text_properties(path, predefined_terms)
        grouped_properties = self.group_similar_fonts(text_properties)
        different_texts = self.identify_different_fonts_and_sizes(grouped_properties)

        font_suggestions = []
        for item in different_texts:
            font_suggestion = f"Formatting issue at Page: {item['page_num']}, Text: {item['text']}, Reason: {item['reason']}, Found font size: {item['found_size']}, Found font name: {item['found_font_name']}"
            font_suggestions.append(font_suggestion)

        missing_sections, sections_not_capitalized = self.extract_sections_from_resume(text)

        linkedin_urls_suggestion = str()
        common_project = str()
        if not name:
            name_suggestion = "Please add name to the resume."
        if not contact_number:
            contact_suggestion = "Please add the contact number to the resume."
        if not email:
            email_suggestion = "Please add the email address to the resume."
        if not github_urls:
            github_urls_suggestions = "add the github_urls to the resume."
        if not linkedin_urls:
            linkedin_urls_suggestion = "add the linkedin_urls to the resume."
        if found_projects:
            common_project = "Common projects found in Projects section: "
            for project in found_projects:
                common_project += project

        # Replace the existing project length suggestion code with:
        project_list = sections_text.get("PROJECTS", [])
        projects_with_description = [
            p for p in project_list
            if "description" in p.lower()
        ]
        project_count = len(projects_with_description)

        if project_count == 0:
            project_length_suggestion = "No projects found. Consider at least 2 projects."
        elif project_count == 1:
            project_length_suggestion = "Only 1 project found. Consider adding 1 more project."
        else:
            project_length_suggestion = f"{project_count} projects found."
        # NOTE(review): source chunk ends here; the rest of parse_text
        # (presumably assembling and returning resume_data) is not visible.
- - # Store in resume data (keeps your existing URL extraction) - resume_data["project_length_suggestion"] = project_length_suggestion - - experience_text = sections_text.get("WORK EXPERIENCE", "") # ✅ Extract work experience section - relevant_experience_score = self.calculate_relevant_experience_score(experience_text) # ✅ Calculate score - - # ✅ Store in the final resume data output - resume_data["relevant_experience_score"] = relevant_experience_score - - - recommended_blogs = random.sample(blog_articles, 2) - recommended_youtube = random.sample(youtube_links, 2) - - # Calculate imarticus_score - imarticus_score = self.imarticus_review_score( - name, - contact_number, - email, - linkedin_urls, - github_urls, - missing_sections, - sections_not_capitalized, - common_projects=found_projects, # Ensure to pass found projects - section_order_suggestion=experience_order_suggestion, - skills=skills_found, # Pass order suggestion - sections_text=sections_text, - relevant_experience_score=relevant_experience_score, - # project_link_score=project_link_score - #pdf_path=path - #relevant_keywords_found=bool(found_keywords), # Convert to boolean - #experience_orderly_arranged=experience_order_suggestion, # Pass orderly arrangement check - #experience_section_present="WORK EXPERIENCE" in sections_text # Check if experience section is present - ) - - # Populate resume data dictionary - resume_data = { - "name": name, - "contact_number": contact_number, - "email": email, - "linkedin_urls": linkedin_urls, - "experience_order_suggestion": experience_order_suggestion, - "education_order_suggestion": education_order_suggestion, - "grammer_issues_by_section": section_by_grammer_issues, - "github_urls": github_urls, - "skills": skills_found, - "spelling_suggestions": spelling_suggestions, - "found_keywords": found_keywords, - "text": text, - "font_suggestions": font_suggestions, - "name_suggestion": name_suggestion, - "contact_suggestion": contact_suggestion, - "email_suggestion": 
email_suggestion, - "imarticus_score": imarticus_score, - "github_urls_suggestions": github_urls_suggestions, - "linkedin_urls_suggestion": "Add the LinkedIn URLs to the resume." if not linkedin_urls else "", - "missing_sections": missing_sections, - "common_projects": "Common projects found in Projects section: " + ", ".join(found_projects) if found_projects else "", - "project_length_suggestion": project_length_suggestion, - "extra_urls": extra_urls, - "certifications": { - "found": found_imarticus_certification["found"], - "message": found_imarticus_certification["message"], - "text": certifications_text # Store extracted certification text - }, - "recommended_blogs": recommended_blogs, - "recommended_youtube_links": recommended_youtube - } - - # Additional checks and data additions - if "WORK EXPERIENCE" in sections_text.keys() and "WORK EXPERIENCE" != list(sections_text.keys())[2]: - section_order_suggestion = f"WORK EXPERIENCE should come before {list(sections_text.keys())[2]}" - resume_data["section_order_suggestion"] = section_order_suggestion - - missing_important_sections = self.check_missing_sections(resume_data) - resume_data["basic_information_section"] = missing_important_sections or "Basic information is Found" - - missing_skills = list(set(essential_skills) - set(skills_found)) - resume_data["missing_skills"] = missing_skills - - found_keywords_count = len(resume_data["found_keywords"]) - num_keywords = len(keyword_variations) - quality_mapping = {"Low": 0.2, "Medium": 0.5, "High": 0.8} # Assuming some quality mapping - for quality, threshold in quality_mapping.items(): - if found_keywords_count < num_keywords * threshold: - resume_data["quality"] = quality - break - - found_certification = "Imarticus certification found in Certifications section." if found_imarticus_certification else "No Imarticus certification found in Certifications section." 
- resume_data["found_certification"] = found_certification - - # Experience relevance check - Extract_exp_sections = ['WORK EXPERIENCE'] - experience_text = self.extract_and_format_sections(sections_text, Extract_exp_sections) - if experience_text: - resume_data["work_experience_check"] = "Experience is relevant to Data science." if any(variation.lower() in experience_text.lower() for keyword, variations in keyword_variations.items() for variation in variations) else "Experience is not relevant to Data science." - - return jsonify(resume_data) +#total score = 67 +# python file to parse different section from resume +from pdfminer.high_level import extract_pages, extract_text +from pdfminer.layout import LTTextContainer, LTChar, LTTextLineHorizontal +from collections import defaultdict +from flask import jsonify +import re, fitz, requests, logging, datetime +import src.config as config +from .config import data_science_skills, keyword_variations, essential_skills, quality_mapping, Extract_sections, suggested_projects, ignore_rule_ids +from .config import required_sections, linkedin_domain, github_domain, basic_informations, section_headers, common_projects, ignore_error_keywords,blog_articles,youtube_links +from .config import kaggle_domain,hackerrank_domain,leetcode_domain,medium_domain +from spacy.matcher import Matcher +import language_tool_python +from collections import defaultdict +import random +tool = language_tool_python.LanguageTool('en-US',use_signal_handler=False) + + + +class ResumeParser: + + def extract_contact_number_from_resume(self, text): + contact_number = None + suggestion = "" + + # Use regex pattern to find a potential contact number + pattern = r"\b(?:\+?\d{1,3}[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}\b" + match = re.search(pattern, text) + if match: + contact_number = match.group() + # Check if the contact number is of the correct length + digits_only = re.sub(r'\D', '', contact_number) + if len(digits_only) == 10: + suggestion = "" + 
elif len(digits_only) > 10 and digits_only.startswith('91') and len(digits_only[2:]) == 10: + suggestion = "" + else: + suggestion = "Contact number should have exactly 10 digits." + + return contact_number, suggestion + + + + def extract_hyperlinks(self, pdf_path): + doc = fitz.open(pdf_path) + links = [] + + for page_num in range(len(doc)): + page = doc.load_page(page_num) + link_list = page.get_links() + for link in link_list: + uri = link.get('uri', None) + if uri: + links.append(uri) + + return links + + def extract_text_from_pdf(self, pdf_path): + return extract_text(pdf_path) + + def extract_email_from_text(self, text): + pattern = r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b" + match = re.search(pattern, text) + if match: + return match.group() + return None + + def extract_email_from_resume(self, pdf_path): + text = self.extract_text_from_pdf(pdf_path) + email = self.extract_email_from_text(text) + suggestion = "" + + # If no email found in text, check hyperlinks + if not email: + links = self.extract_hyperlinks(pdf_path) + for link in links: + if link.startswith('mailto:'): + email_candidate = link.split('mailto:')[1] + if self.is_valid_email(email_candidate): + email = email_candidate + break + + # Additional validation for email found in text or links + if email and not self.is_valid_email(email): + suggestion += "Your email address doesn't seem to be valid. Please check and correct." 
+ + return email, suggestion + + + def is_valid_email(self, email): + # Length check + if len(email) > 254: + return False + + # Consecutive special characters check + if re.search(r"[._%+-]{2,}", email): + return False + + # Domain part validation + domain_part = email.split('@')[1] + if not re.match(r"[A-Za-z0-9.-]+\.[A-Za-z]{2,}", domain_part): + return False + + # Standard email format check + pattern = r"^[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$" + return re.match(pattern, email) is not None + + + def extract_sections_from_resume(self, text): + missing_sections = [] + sections_not_capitalized = [] + + for section in required_sections: + pattern = r"\b{}\b".format(re.escape(section)) + + match_obj = re.search(pattern, text, re.IGNORECASE) + if not match_obj: + missing_sections.append(section) + else: + if match_obj.group() not in map(str.upper, required_sections): + sections_not_capitalized.append(section) + + return missing_sections, sections_not_capitalized + + def extract_skills_from_resume(self, text): + if not isinstance(text, str): + raise ValueError(f"Expected 'text' to be a string, but got {type(text)}") + + skills = [] + for skill in essential_skills: + pattern = r"\b{}\b".format(re.escape(skill)) + match = re.search(pattern, text, re.IGNORECASE) + if match: + skills.append(skill) + return skills + + def extract_keyword_variations_from_resume(self, text): + found_keywords = [] + for keyword, variations in keyword_variations.items(): + for variation in variations: + if variation.lower() in text.lower(): + found_keywords.append(variation) + break + + return found_keywords + + def extract_keyword_variations_from_formatted_text(self, formatted_text): + found_keyword_section = [] + for keyword, variations in keyword_variations.items(): + for variation in variations: + if variation.lower() in formatted_text.lower(): + found_keyword_section.append(variation) + break + + return found_keyword_section + + def extract_linkedIn_urls_from_pdf(self, 
pdf_path): + linkedin_urls = None + pdf_document = fitz.open(pdf_path) + for page_num in range(len(pdf_document)): + page = pdf_document.load_page(page_num) + links = page.get_links() + for link in links: + url = link.get('uri', '') + if re.search(linkedin_domain, url): + linkedin_urls = url + pdf_document.close() + return linkedin_urls + + def extract_github_urls_from_pdf(self, pdf_path): + github_urls = None + pdf_document = fitz.open(pdf_path) + for page_num in range(len(pdf_document)): + page = pdf_document.load_page(page_num) + links = page.get_links() + for link in links: + url = link.get('uri', '') + if re.search(github_domain, url): + path = re.sub(github_domain, '', url) + parts = path.split('/') + if len(parts) == 1: + github_urls = url + pdf_document.close() + return github_urls + + + def extract_extra_urls_pdf(self,pdf_path, domains): + extracted_urls = defaultdict(set) + try: + # Open the PDF document + pdf_document = fitz.open(pdf_path) + + # Iterate through all pages in the PDF + for page_num in range(len(pdf_document)): + page = pdf_document.load_page(page_num) + links = page.get_links() + + for link in links: + url = link.get('uri', '') + if url: # Ensure there's a URL + for domain in domains: + if re.search(domain, url, re.IGNORECASE): + extracted_urls[domain].add(url) # Add URL to the domain's set + except Exception as e: + print(f"Error processing PDF: {e}") + finally: + pdf_document.close() + + return {domain: list(urls) for domain, urls in extracted_urls.items()} + + def is_valid_url(self , github_urls ): + suggest = "" + for _ in [github_urls]: + if not github_urls: + break + + try: + response = requests.head(github_urls) + if response.status_code != 200: + suggest = "GitHub URL is not valid, please check and correct. " + except requests.RequestException: + suggest = "GitHub URL is not valid, please check and correct. 
" + + return suggest + return suggest + + + def is_valid_name(self, name): + if any(char.isdigit() for char in name): + return False + if len(name.split()) > 3: + return False + common_non_names = {"Email", "Github", "LinkedIn", "Portfolio", "Data Analyst"} + if name in common_non_names: + return False + return True + + def extract_name(self, resume_text): + + lines = resume_text.split('\n') + + # Use regex to find lines that likely contain names + name_lines = [line for line in lines if re.match(r'^[A-Za-z]*\s[A-Za-z]*', line.strip())] + + names = [] + for i in range(len(name_lines)): + if self.is_valid_name(name_lines[i].strip()): + names.append(name_lines[i].strip()) + + if len(names) >= 1: + name = names[0] + suggestion = "" + # Check if the name parts contain only alphabetic characters + name_parts = name.split() + if any(part[0].islower() for part in name_parts): + suggestion += " name should start with a capital letter. " + return name, suggestion + + return None, "No valid name found" + + + def check_missing_sections(self, resume_data): + missing_information = [] + for section in basic_informations: + if not resume_data.get(section): + missing_information.append(section) + return missing_information + + def segregate_sections(self, text): + header_pattern = re.compile(rf'^\s*({"|".join(re.escape(header) for header in section_headers)}):?\s*$', re.IGNORECASE) + sections_text = {} + current_section = None + lines = text.splitlines() + for line in lines: + clean_line = line.strip() + match = header_pattern.match(clean_line) + if match: + current_section = match.group(1).upper() + sections_text[current_section] = [] + elif current_section: + sections_text[current_section].append(line.strip()) + + return sections_text + + def extract_and_format_sections(self, sections_text, Extract_sections): + formatted_text = "" + for section in Extract_sections: + if section in sections_text: + section_content = " ".join(sections_text[section]).replace('\n', ' ') + 
formatted_text += f"{section}:\n{section_content}\n\n" + return formatted_text + + def replace_keywords_with_placeholders(self, formatted_text, found_keyword_section): + placeholder_text = formatted_text + keyword_placeholders = {} + + # Use a set to avoid duplicates and keep track of keyword placeholders + used_keywords = set() + for i, keyword in enumerate(found_keyword_section): + if keyword not in used_keywords: + used_keywords.add(keyword) + placeholder = f"{{KEYWORD_{i}}}" + keyword_placeholders[placeholder] = keyword + # Using word boundary to match whole words + placeholder_text = re.sub(r'\b' + re.escape(keyword) + r'\b', placeholder, placeholder_text, flags=re.IGNORECASE) + + return placeholder_text, keyword_placeholders + + def replace_placeholders_with_keywords(self, grammar_issues, keyword_placeholders): + updated_issues = [] + for issue in grammar_issues: + context = issue['context'] + for placeholder, keyword in keyword_placeholders.items(): + context = context.replace(placeholder, keyword) + # Update the context in the issue dictionary + issue['context'] = context + updated_issues.append(issue) + return updated_issues + + def grammar_check(self, placeholder_text): + matches = tool.check(placeholder_text) + grammar_issues = [] + for match in matches: + issue = { + "context": match.context, + "error": match.message, + "rule_id": match.ruleId, + "suggested_correction": match.replacements + } + grammar_issues.append(issue) + return grammar_issues + + def filter_grammar_issues(self, grammar_issues, ignore_rule_ids=None, ignore_error_keywords=None): + if ignore_rule_ids is None: + ignore_rule_ids = [] + if ignore_error_keywords is None: + ignore_error_keywords = [] + + filtered_issues = [] + for issue in grammar_issues: + if issue['rule_id'] not in ignore_rule_ids and not any(keyword in issue['error'] for keyword in ignore_error_keywords): + filtered_issues.append(issue) + + return filtered_issues + + def process_resume(self, text, found_keyword_section, 
Extract_sections): + sections_text = self.segregate_sections(text) + formatted_text = self.extract_and_format_sections(sections_text, Extract_sections) + found_keyword_section = self.extract_keyword_variations_from_formatted_text(formatted_text) + placeholder_text, keyword_placeholders = self.replace_keywords_with_placeholders(formatted_text, found_keyword_section) + grammar_issues = self.grammar_check(placeholder_text) + grammar_issues_text = self.replace_placeholders_with_keywords(grammar_issues, keyword_placeholders) + filtered_grammar_issues = self.filter_grammar_issues(grammar_issues, ignore_rule_ids, ignore_error_keywords) + return { + "grammar_issues": filtered_grammar_issues, + "spelling_errors": [issue for issue in filtered_grammar_issues if "SPELLING" in issue['rule_id']] + } + + def grammar_issue_check(self, text, found_keyword_section, Extract_sections): + issues = {} + text1 = " ".join(text.split("\n")) + for section in Extract_sections: + grammar_issues = self.process_resume(text, found_keyword_section, [section]) + if not grammar_issues: + grammar_issues = "no error found" + issues[section] = grammar_issues + return issues + + def normalize_font_name(self,font_name): + if '-' in font_name: + font_name = font_name.split('-')[0] + if '+' in font_name: + font_name = font_name.split('+')[1] + return font_name + + + def extract_text_properties(self, pdf_path, predefined_terms): + text_properties = [] + current_phrase = "" + current_font_size = None + current_font_name = None + current_page_num = None + + special_characters = set("●▪•!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~") + + def add_current_phrase(): + nonlocal current_phrase + if current_phrase.strip(): + flag = any(current_phrase in term for term in predefined_terms) + if not flag: + text_properties.append({ + "text": current_phrase, + "font_size": current_font_size, + "font_name": current_font_name, + "page_num": current_page_num + }) + current_phrase = "" + + for page_layout in extract_pages(pdf_path): 
+ for element in page_layout: + if isinstance(element, LTTextContainer): + for text_line in element: + if isinstance(text_line, LTTextLineHorizontal): + for character in text_line: + if isinstance(character, LTChar): + text = character.get_text() + font_size = round(character.size, 2) + font_name = self.normalize_font_name(character.fontname) + page_num = page_layout.pageid + + if text.isspace() or text in special_characters: + add_current_phrase() + continue + + if (font_size != current_font_size or font_name != current_font_name or + page_num != current_page_num): + add_current_phrase() + current_font_size = font_size + current_font_name = font_name + current_page_num = page_num + + current_phrase += text + + add_current_phrase() + + return text_properties + + def group_similar_fonts(self,text_properties, tolerance=0.5): + grouped_properties = defaultdict(list) + + for prop in text_properties: + rounded_size = round(prop["font_size"] / tolerance) * tolerance + key = (prop["font_name"], rounded_size) + grouped_properties[key].append(prop) + + return grouped_properties + + + + + def identify_different_fonts_and_sizes(self, grouped_properties): + most_common_group = max(grouped_properties.values(), key=len) + most_common_key = None + for key, group in grouped_properties.items(): + if group == most_common_group: + most_common_key = key + break + + different_texts = [] + + for key, group in grouped_properties.items(): + if group != most_common_group: + for prop in group: + reason = [] + if key[1] != most_common_key[1]: + reason.append(f"size not {most_common_key[1]}") + if key[0] != most_common_key[0]: + reason.append(f"font not {most_common_key[0]}") + different_texts.append({ + "page_num": prop['page_num'], + "text": prop['text'], + "found_size": prop['font_size'], + "found_font_name": prop['font_name'], + "reason": ", ".join(reason) + }) + + return different_texts + + def parse_dates(self, sections_text, section_name): + # Check if the section is in the text + 
suggest = "" + + # Define the date patterns to match various date formats + date_pattern = ( + r'\b\d{1,2}/\d{4}\b|' # MM/YYYY + r'\b(?:jan(?:uary)?|feb(?:ruary)?|mar(?:ch)?|apr(?:il)?|may|jun(?:e)?|jul(?:y)?|aug(?:ust)?|sep(?:tember)?|oct(?:ober)?|nov(?:ember)?|dec(?:ember)?)\s+\d{4}\b|' # Month YYYY + r'\b(?:jan(?:uary)?|feb(?:ruary)?|mar(?:ch)?|apr(?:il)?|may|jun(?:e)?|jul(?:y)?|aug(?:ust)?|sep(?:tember)?|oct(?:ober)?|nov(?:ember)?|dec(?:ember)?)\s+\d{1,2},?\s*\d{4}\b|' # Month DD, YYYY + r'\b\d{4}\b|' # YYYY + r'\b(?:jan(?:uary)?|feb(?:ruary)?|mar(?:ch)?|apr(?:il)?|may|jun(?:e)?|jul(?:y)?|aug(?:ust)?|sep(?:tember)?|oct(?:ober)?|nov(?:ember)?|dec(?:ember)?)[a-z]*/?\d{4}\b|' # Month/YYYY + r'\b(?:jan(?:uary)?|feb(?:ruary)?|mar(?:ch)?|apr(?:il)?|may|jun(?:e)?|jul(?:y)?|aug(?:ust)?|sep(?:tember)?|oct(?:ober)?|nov(?:ember)?|dec(?:ember)?)[a-z]*\d{4}\s*-\s*(?:jan(?:uary)?|feb(?:ruary)?|mar(?:ch)?|apr(?:il)?|may|jun(?:e)?|jul(?:y)?|aug(?:ust)?|sep(?:tember)?|oct(?:ober)?|nov(?:ember)?|dec(?:ember)?)[a-z]*\d{4}\b' # Month/YYYY - Month/YYYY + ) + + all_dates = [] + + # Iterate over the entries in the section_name + for entry in sections_text[section_name]: + entry = entry.lower() + matches = re.findall(date_pattern, entry) + if matches and len(matches)>1: + if len(matches) == 2: + all_dates.append(f"{matches[0]} {matches[1]}") + else: + all_dates.extend(matches) + + return all_dates + + + def convert_to_date(self, date_str): + # Mapping of month names and abbreviations to their numeric equivalents + month_map = { + 'jan': 1, 'january': 1, 'feb': 2, 'february': 2, + 'mar': 3, 'march': 3, 'apr': 4, 'april': 4, + 'may': 5, 'jun': 6, 'june': 6, 'jul': 7, + 'july': 7, 'aug': 8, 'august': 8, 'sep': 9, + 'september': 9, 'oct': 10, 'october': 10, + 'nov': 11, 'november': 11, 'dec': 12, 'december': 12, + '01': 1, '02': 2, '03': 3, '04': 4, + '05': 5, '06': 6, '07': 7, '08': 8, + '09': 9, '10': 10, '11': 11, '12': 12 + } + + # Regex patterns to match different date formats + 
pattern_mm_yyyy = re.compile(r'(\d{1,2})/(\d{4})') + pattern_mm_yyyy_space = re.compile(r'(\d{1,2})\s(\d{4})') + pattern_month_yyyy = re.compile(r'([a-zA-Z]+)\s?(\d{4})') + pattern_yyyy = re.compile(r'(\d{4})') + + def extract_date(date_str): + match_mm_yyyy = pattern_mm_yyyy.match(date_str) + match_mm_yyyy_space = pattern_mm_yyyy_space.match(date_str) + match_month_yyyy = pattern_month_yyyy.match(date_str) + match_yyyy = pattern_yyyy.match(date_str) + + if match_mm_yyyy: + month = int(match_mm_yyyy.group(1)) + year = int(match_mm_yyyy.group(2)) + elif match_mm_yyyy_space: + month = int(match_mm_yyyy_space.group(1)) + year = int(match_mm_yyyy_space.group(2)) + elif match_month_yyyy: + month = month_map.get(match_month_yyyy.group(1).lower()) + year = int(match_month_yyyy.group(2)) + elif match_yyyy: + month = 1 + year = int(match_yyyy.group(1)) + else: + return [] + + return datetime.date(year, month, 1) + + date_parts = re.findall(r'(\d{4}\s[a-zA-Z]+\s?|\d{4}[a-zA-Z]+|\d{4}\/\d{2}|\d{4}\s\d{2}|[a-zA-Z]+\s?\d{4}|\d{4}\s[a-zA-Z]+)', date_str) + if len(date_parts) == 1: + # Standalone year or single date + start_date = extract_date(date_parts[0]) + end_date = datetime.date(start_date.year, start_date.month, start_date.day) + elif len(date_parts) == 2: + # Date range + start_date = extract_date(date_parts[0]) + end_date = extract_date(date_parts[1]) + else: + return [] + + return start_date, end_date + + + def date_time(self, date_parts): + converted_dates = [] + for date_part in date_parts: + start_date, end_date = self.convert_to_date(date_part) + converted_dates.append((start_date, end_date)) + return converted_dates + + + def check_chronological_order(self, converted_dates, section_name ): + suggestion = "" + sorted_dates = sorted(converted_dates, key=lambda x: (x[1], x[0]), reverse=True) + if converted_dates == sorted_dates: + suggestion = f"{section_name} section is in chronological order." 
+ else: + suggestion = f"{section_name} section is not in chronological order." + + return suggestion + + def check_common_projects(self, projects_text): + found_projects = [] + for project in common_projects: + if project.lower() in projects_text.lower(): + found_projects.append(project) + return found_projects + + def recommend_resources(): + # Randomly pick 2 blog articles and 2 YouTube links + recommended_blogs = random.sample(blog_articles, 2) + recommended_youtube = random.sample(youtube_links, 2) + + # Return the recommendations + return { + "Recommended Blogs": recommended_blogs, + "Recommended YouTube Links": recommended_youtube + } + + def check_imarticus_certifications(self, certifications_text): + # Check if "imarticus" is present in the certifications text + if "imarticus" in certifications_text.lower(): + return { + "found": True, + "message": "Imarticus certification found. Please upload it in the academic section." + } + return { + "found": False, + "message": "No Imarticus certification found in the provided text." + } + + + def chronological_order_check(self, sections_text, section_name): + order_suggestion = "" + suggestion = "" + section_name = section_name.upper() + if section_name in sections_text: + date = self.parse_dates(sections_text, section_name) + if date: + converted_dates = self.date_time(date) + order_suggestion = self.check_chronological_order(converted_dates, section_name) + else: + suggestion = f"No valid dates found in {section_name} section. " + else: + suggestion = f"{section_name} is not in section header. " + + return order_suggestion, suggestion + + + + # Function to check for spelling mistakes + def check_spelling(self, headers, section_headers): + suggestions = [] + for header in headers: + if header.upper() not in map(str.upper, section_headers): + suggestions = header + return suggestions + + def is_present_name(name): + """ + Checks if a given name has at least 2 words. + + Args: + name: The name string to check. 
+ + Returns: + True if it has at least 2 words, false otherwise. + """ + parts = name.split() + return len(parts) >= 2 + + def is_sentence_case(name): + + parts = name.split() # Split into individual words + for part in parts: + if not part: # handles empty strings in name + continue + if not part[0].isupper() or not part[1:].islower(): + return False # Check if first letter is uppercase and rest are lowercase + return True + + def is_present_name(self,name): + parts = name.split() + return len(parts) >= 2 + + def is_sentence_case(self,name): + parts = name.split() + for part in parts: + if not part: + continue + if not part[0].isupper() or not part[1:].islower(): + return False + return True + + def extract_project_links(self,sections_text): + project_links = {} + + if "PROJECTS" in sections_text: + project_list = sections_text.get("PROJECTS", []) + url_pattern = r"https?://[^\s]+" + for project in project_list: + links = re.findall(url_pattern,project) + if links: + project_links[project] = links + return project_links + + def count_sentences(self,text): + sentence_endings = r"(? 2 and num_sentences <= 4: + return 3 + elif num_sentences > 4: + return 5 + else: + return 0 + + def calculate_extra_urls_bonus(self,pdf_path): + domains = [ + r"hackerrank\.com", # Hackerrank + r"leetcode\.com", # LeetCode + r"medium\.com" # Medium + ] + extra_urls = self.extract_extra_urls_pdf(pdf_path, domains) + has_extra_urls = any(urls for urls in extra_urls.values()) + return 5 if has_extra_urls else 0 + + def calculate_relevant_experience_score(self, experience_text): + """ + Assigns a score based on the presence of relevant experience keywords. + + Args: + experience_text (str): The extracted work experience section text. + + Returns: + int: A score of 5 if relevant keywords are found, otherwise 0. 
+ """ + if not experience_text: + return 0 # ✅ No experience section → Score 0 + + if isinstance(experience_text, list): + experience_text = " ".join(experience_text) # ✅ Convert list to a single string + + experience_text = experience_text.strip().lower() # ✅ Ensure it's a string and lowercase + + # ✅ Check if any keyword from 'data_science_skills' or 'essential_skills' exists + for skill in config.data_science_skills + config.essential_skills: + if skill.lower() in experience_text: + return 5 # ✅ Found relevant experience → Full score + + return 0 + + def calculate_ds_skills_score(self, skills_present): + if not skills_present: # No skills found at all + return 0 + + # Use skills from config instead of hardcoded list + ds_skills_list_lower = [skill.lower() for skill in config.data_science_skills] + skills_present_lower = [skill.lower() for skill in skills_present] + + matching_count = sum(1 for skill in skills_present_lower + if skill in ds_skills_list_lower) + + if matching_count == 0: # Skills found but none match DS list + return 2 + elif 1 <= matching_count <= 5: + return 3 + elif matching_count > 5: + return 5 + return 0 + + def calculate_project_link_score(self, projects_with_links): + """ + Assigns a score based on whether project links are present. + + Args: + projects_with_links (int): The number of projects with links. + + Returns: + int: 2 if project links are found, otherwise 0. 
+ """ + return 2 if projects_with_links > 0 else 0 + + + def imarticus_review_score(self,name,contact_number,email,linkedin_urls,github_url,missing_sections,sections_not_capitalized,common_projects,section_order_suggestion,sections_text,skills,relevant_experience_score): + score = 0 + if name: + name_parts = name.split() + num_parts = len(name_parts) + + if num_parts == 0: + score += 0 + if self.is_sentence_case(name): + score += 3 + elif self.is_present_name(name): + score += 1.5 + + if contact_number and isinstance(contact_number, str): + digits_only = re.sub(r'\D', '', contact_number) + + if digits_only.startswith("91") and len(digits_only) > 10: + digits_only = digits_only[2:] # Remove the first two characters ('91') + + if len(digits_only) == 10 and digits_only[0] in "6789": # Check for valid Indian mobile numbers + score += 3 + + if email: + score += 3 if self.is_valid_email(email) else 0 + + score += 3 if linkedin_urls else 0 + + if github_url: + github_suggestion = self.is_valid_url(github_url) + score += 3 if not github_suggestion else 0 + else: + score += 0 + + if len(missing_sections)==0 and len(sections_not_capitalized)==0: + score+=10 + elif len(missing_sections)==0 and len(sections_not_capitalized)>0: + score+=8 + elif len(missing_sections)<=3: + score+=6 + elif len(missing_sections)>4: + score+=3 + + if common_projects: + score +=0 + else: + score +=5 + + if section_order_suggestion: + score -= 2 + else: + score + + """ + ds_skills_list_lower = [skill.lower() for skill in data_science_skills] + skills_present_lower = [skill.lower() for skill in self.extract_skills_from_resume(skills) ] + + matching_skill_count = 0 + for skill in skills_present_lower: + if ds_skills_list_lower: + matching_skill_count+=1 + if matching_skill_count==0: + score+=0 + + if matching_skill_count<=5: + score+=2 + elif matching_skill_count>=10 and matching_skill_count<=15: + score+5 + else: + score+=8 + """ + + if "PROJECTS" not in sections_text: + score+=0 + else: + 
project_list = sections_text.get("PROJECTS",[]) + project_count = len([x for x in project_list if "Description" in x]) + + if project_count<=2: + score+=2 + elif project_count>2 and project_count<=4: + score+=5 + elif project_count>4: + score+=3 + + + resume_data = {} + # Extract projects & links + project_links = self.extract_project_links(sections_text) + projects_with_links = len(project_links) + + # ✅ Count only projects with descriptions + valid_projects = [ + p for p in sections_text.get("PROJECTS", []) if "description" in p.lower() + ] + total_projects = len(valid_projects) # ✅ Count projects properly + + # ✅ Calculate project link score + project_link_score = self.calculate_project_link_score(projects_with_links) + resume_data["project_link_score"] = project_link_score + + # ✅ Prevent division by zero + if total_projects > 0: + if projects_with_links == 0: + score += 0 + elif projects_with_links / total_projects >= 0.5: + score += 1.5 + if projects_with_links == total_projects: + score += 3 + else: + score += 0 # ✅ Ensure no division error if no projects exist + + + """" + profile_summary = sections_text.get("PROFILE SUMMARY", "") + print(profile_summary) + + summary_score = self.calculate_summary_score(profile_summary) + score += summary_score + """ + ds_skills_score = self.calculate_ds_skills_score(skills) + score += ds_skills_score + + certifications = sections_text.get("CERTIFICATIONS & ACADEMIC ENDEAVOURS", []) + num_certifications = len(certifications) + + if num_certifications==0: + score+=0 + elif 0 < num_certifications <= 2: + score+=3 + elif 2 < num_certifications <= 4: + score+=5 + elif num_certifications>4: + score+=7 + """ + extra_urls_bonus = self.calculate_extra_urls_bonus(pdf_path) + score += extra_urls_bonus + """ + score += relevant_experience_score + + score += project_link_score + + return score + + + def imarticus_detailed_score(self, name, contact_number, email, linkedin_urls, github_url, + missing_sections=None, 
sections_not_capitalized=None, common_projects=None, + section_order_suggestion=None, sections_text=None, skills=None, + relevant_experience_score=0): + + # Ensure lists and dictionaries have default values to avoid 'NoneType' errors + missing_sections = missing_sections or [] + sections_not_capitalized = sections_not_capitalized or [] + common_projects = common_projects or [] + sections_text = sections_text or {} + + score_breakdown = { + "name_score": 0, + "contact_number_score": 0, + "email_score": 0, + "linkedin_url_score": 0, + "github_url_score": 0, + "missing_sections_score": 0, + "common_projects_score": 0, + "section_order_score": 0, + "projects_score": 0, + "certifications_score": 0, + "relevant_experience_score": 0, + "ds_skills_score": 0, + "extra_urls_bonus": 0, + "summary_score": 0, + "project_link_score": 0 + } + + # ✅ Name Score (3 Points) + if name: + if self.is_sentence_case(name): + score_breakdown["name_score"] = 3 + elif self.is_present_name(name): + score_breakdown["name_score"] = 1.5 + + # ✅ Contact Number Score (3 Points) + if contact_number and isinstance(contact_number, str): + digits_only = re.sub(r'\D', '', contact_number) + if digits_only.startswith("91") and len(digits_only) > 10: + digits_only = digits_only[2:] + if len(digits_only) == 10 and digits_only[0] in "6789": + score_breakdown["contact_number_score"] = 3 + + # ✅ Email Score (3 Points) + score_breakdown["email_score"] = 3 if email and self.is_valid_email(email) else 0 + + # ✅ LinkedIn URL Score (3 Points) + score_breakdown["linkedin_url_score"] = 3 if linkedin_urls else 0 + + # ✅ GitHub URL Score (3 Points) + if github_url and self.is_valid_url(github_url): + score_breakdown["github_url_score"] = 3 + + # ✅ Missing Sections Score (10 Points) + if not missing_sections and not sections_not_capitalized: + score_breakdown["missing_sections_score"] = 10 + elif not missing_sections and sections_not_capitalized: + score_breakdown["missing_sections_score"] = 8 + elif 
len(missing_sections) <= 3: + score_breakdown["missing_sections_score"] = 6 + else: + score_breakdown["missing_sections_score"] = 3 + + # ✅ Common Projects Score (5 Points) + score_breakdown["common_projects_score"] = 0 if common_projects else 5 + + # ✅ Section Order Score (2 Points) + score_breakdown["section_order_score"] = -2 if section_order_suggestion else 0 + + # ✅ Projects Score (5 Points) + if "PROJECTS" in sections_text: + project_list = sections_text.get("PROJECTS", []) + project_count = len([x for x in project_list if "Description" in x]) + if project_count <= 2: + score_breakdown["projects_score"] = 2 + elif 2 < project_count <= 4: + score_breakdown["projects_score"] = 5 + else: + score_breakdown["projects_score"] = 3 + + # ✅ Certifications Score (7 Points) + certifications = sections_text.get("CERTIFICATIONS & ACADEMIC ENDEAVOURS", []) + num_certifications = len(certifications) + if num_certifications == 0: + score_breakdown["certifications_score"] = 0 + elif 0 < num_certifications <= 2: + score_breakdown["certifications_score"] = 3 + elif 2 < num_certifications <= 4: + score_breakdown["certifications_score"] = 5 + else: + score_breakdown["certifications_score"] = 7 + + # ✅ Relevant Experience Score (5 Points) + score_breakdown["relevant_experience_score"] = relevant_experience_score if relevant_experience_score is not None else 0 + + # ✅ Data Science Skills Score (5 Points) + score_breakdown["ds_skills_score"] = self.calculate_ds_skills_score(skills) + + # ✅ Extra URLs Bonus (5 Points) + score_breakdown["extra_urls_bonus"] = self.calculate_extra_urls_bonus(sections_text) + + # ✅ Summary Score (5 Points) + profile_summary = sections_text.get("PROFILE SUMMARY", "") + score_breakdown["summary_score"] = self.calculate_summary_score(profile_summary) + + # ✅ Project Link Score (2 Points) + project_links = self.extract_project_links(sections_text) + projects_with_links = len(project_links) + score_breakdown["project_link_score"] = 
self.calculate_project_link_score(projects_with_links) + + return score_breakdown + + def parse_text(self, path): + logger = logging.getLogger(__name__) + logging.getLogger("pdfminer").setLevel(logging.WARNING) + resume_data = {} + logger.debug('parsing text') + text = self.extract_text_from_pdf(path) + text1 = " ".join(text.split("\n")) + skills_found = self.extract_skills_from_resume(text) + found_keywords = self.extract_keyword_variations_from_resume(text) + sections_text = self.segregate_sections(text) + formatted_text = self.extract_and_format_sections(sections_text, Extract_sections) + found_keyword_section = self.extract_keyword_variations_from_formatted_text(formatted_text) + + parsed_sections = self.segregate_sections(text) + projects = parsed_sections.get("PROJECTS", []) + certifications = parsed_sections.get("CERTIFICATIONS & ACADEMIC ENDEAVOURS", []) + projects_text = "\n".join(projects) + certifications_text = "\n".join(certifications) + found_imarticus_certification = self.check_imarticus_certifications(certifications_text) + found_projects = self.check_common_projects(projects_text) + + name, name_suggestion = self.extract_name(text) + contact_number, contact_suggestion = self.extract_contact_number_from_resume(text) + email, email_suggestion = self.extract_email_from_resume(path) + github_urls = self.extract_github_urls_from_pdf(path) + github_urls_suggestions = self.is_valid_url(github_urls) + linkedin_urls = self.extract_linkedIn_urls_from_pdf(path) + section_by_grammer_issues = self.grammar_issue_check(text, found_keyword_section, Extract_sections) + + + domains = [ + r"hackerrank\.com", # Hackerrank + r"leetcode\.com", # LeetCode + r"medium\.com" # Medium + ] + extra_urls = self.extract_extra_urls_pdf(path, domains) + + education_order_suggestion, education_suggestion = self.chronological_order_check(sections_text, "ACADEMIC PROFILE") + experience_order_suggestion, experience_suggestion = self.chronological_order_check(sections_text, "WORK 
EXPERIENCE") + + headers = list(sections_text.keys()) + spelling_suggestions = self.check_spelling(headers, section_headers) + + predefined_terms = [name, email] + predefined_terms.extend(required_sections) + text_properties = self.extract_text_properties(path, predefined_terms) + grouped_properties = self.group_similar_fonts(text_properties) + different_texts = self.identify_different_fonts_and_sizes(grouped_properties) + + font_suggestions = [] + for item in different_texts: + font_suggestion = f"Formatting issue at Page: {item['page_num']}, Text: {item['text']}, Reason: {item['reason']}, Found font size: {item['found_size']}, Found font name: {item['found_font_name']}" + font_suggestions.append(font_suggestion) + + missing_sections, sections_not_capitalized = self.extract_sections_from_resume(text) + + linkedin_urls_suggestion = str() + common_project = str() + if not name: + name_suggestion = "Please add name to the resume." + if not contact_number: + contact_suggestion = "Please add the contact number to the resume." + if not email: + email_suggestion = "Please add the email address to the resume." + if not github_urls: + github_urls_suggestions = "add the github_urls to the resume." + if not linkedin_urls: + linkedin_urls_suggestion = "add the linkedin_urls to the resume." + if found_projects: + common_project = "Common projects found in Projects section: " + for project in found_projects: + common_project += project + + # Replace the existing project length suggestion code with: + project_list = sections_text.get("PROJECTS", []) + projects_with_description = [ + p for p in project_list + if "description" in p.lower() + ] + project_count = len(projects_with_description) + + if project_count == 0: + project_length_suggestion = "No projects found. Consider at least 2 projects." + elif project_count == 1: + project_length_suggestion = "Only 1 project found. Consider adding 1 more project." + else: + project_length_suggestion = f"{project_count} projects found." 
+ + # Store in resume data (keeps your existing URL extraction) + resume_data["project_length_suggestion"] = project_length_suggestion + + experience_text = sections_text.get("WORK EXPERIENCE", "") # ✅ Extract work experience section + relevant_experience_score = self.calculate_relevant_experience_score(experience_text) # ✅ Calculate score + + # ✅ Store in the final resume data output + resume_data["relevant_experience_score"] = relevant_experience_score + + + recommended_blogs = random.sample(blog_articles, 2) + recommended_youtube = random.sample(youtube_links, 2) + + # Calculate imarticus_score + imarticus_score = self.imarticus_review_score( + name, + contact_number, + email, + linkedin_urls, + github_urls, + missing_sections, + sections_not_capitalized, + common_projects=found_projects, # Ensure to pass found projects + section_order_suggestion=experience_order_suggestion, + skills=skills_found, # Pass order suggestion + sections_text=sections_text, + relevant_experience_score=relevant_experience_score, + # project_link_score=project_link_score + #pdf_path=path + #relevant_keywords_found=bool(found_keywords), # Convert to boolean + #experience_orderly_arranged=experience_order_suggestion, # Pass orderly arrangement check + #experience_section_present="WORK EXPERIENCE" in sections_text # Check if experience section is present + ) + + # Populate resume data dictionary + resume_data = { + "name": name, + "contact_number": contact_number, + "email": email, + "linkedin_urls": linkedin_urls, + "experience_order_suggestion": experience_order_suggestion, + "education_order_suggestion": education_order_suggestion, + "grammer_issues_by_section": section_by_grammer_issues, + "github_urls": github_urls, + "skills": skills_found, + "spelling_suggestions": spelling_suggestions, + "found_keywords": found_keywords, + "text": text, + "font_suggestions": font_suggestions, + "name_suggestion": name_suggestion, + "contact_suggestion": contact_suggestion, + "email_suggestion": 
email_suggestion, + "imarticus_score": imarticus_score, + "github_urls_suggestions": github_urls_suggestions, + "linkedin_urls_suggestion": "Add the LinkedIn URLs to the resume." if not linkedin_urls else "", + "missing_sections": missing_sections, + "common_projects": "Common projects found in Projects section: " + ", ".join(found_projects) if found_projects else "", + "project_length_suggestion": project_length_suggestion, + "extra_urls": extra_urls, + "certifications": { + "found": found_imarticus_certification["found"], + "message": found_imarticus_certification["message"], + "text": certifications_text # Store extracted certification text + }, + "recommended_blogs": recommended_blogs, + "recommended_youtube_links": recommended_youtube + } + + # Additional checks and data additions + if "WORK EXPERIENCE" in sections_text.keys() and "WORK EXPERIENCE" != list(sections_text.keys())[2]: + section_order_suggestion = f"WORK EXPERIENCE should come before {list(sections_text.keys())[2]}" + resume_data["section_order_suggestion"] = section_order_suggestion + + missing_important_sections = self.check_missing_sections(resume_data) + resume_data["basic_information_section"] = missing_important_sections or "Basic information is Found" + + missing_skills = list(set(essential_skills) - set(skills_found)) + resume_data["missing_skills"] = missing_skills + + found_keywords_count = len(resume_data["found_keywords"]) + num_keywords = len(keyword_variations) + quality_mapping = {"Low": 0.2, "Medium": 0.5, "High": 0.8} # Assuming some quality mapping + for quality, threshold in quality_mapping.items(): + if found_keywords_count < num_keywords * threshold: + resume_data["quality"] = quality + break + + found_certification = "Imarticus certification found in Certifications section." if found_imarticus_certification else "No Imarticus certification found in Certifications section." 
+ resume_data["found_certification"] = found_certification + + # Experience relevance check + Extract_exp_sections = ['WORK EXPERIENCE'] + experience_text = self.extract_and_format_sections(sections_text, Extract_exp_sections) + if experience_text: + resume_data["work_experience_check"] = "Experience is relevant to Data science." if any(variation.lower() in experience_text.lower() for keyword, variations in keyword_variations.items() for variation in variations) else "Experience is not relevant to Data science." + + return jsonify(resume_data)