# Python module to parse different sections from a resume
from pdfminer.high_level import extract_pages, extract_text
from pdfminer.layout import LTTextContainer, LTChar, LTTextLineHorizontal
from collections import defaultdict
from flask import jsonify
import re, fitz, requests, logging, datetime
from .config import data_science_skills, keyword_variations, essential_skills, quality_mapping, Extract_sections, suggested_projects, ignore_rule_ids
from .config import required_sections, linkedin_domain, github_domain, basic_informations, section_headers, common_projects, ignore_error_keywords,blog_articles,youtube_links
from .config import kaggle_domain,hackerrank_domain,leetcode_domain,medium_domain
from spacy.matcher import Matcher
import language_tool_python
import random
tool = language_tool_python.LanguageTool('en-US')
class ResumeParser:
def extract_contact_number_from_resume(self, text):
contact_number = None
suggestion = ""
# Use regex pattern to find a potential contact number
pattern = r"\b(?:\+?\d{1,3}[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}\b"
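# Matches common formats such as "9876543210", "987-654-3210", or "+91 9876543210"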
match = re.search(pattern, text)
if match:
contact_number = match.group()
# Check if the contact number is of the correct length
digits_only = re.sub(r'\D', '', contact_number)
if len(digits_only) == 10:
suggestion = ""
elif len(digits_only) > 10 and digits_only.startswith('91') and len(digits_only[2:]) == 10:
suggestion = ""
else:
suggestion = "Contact number should have exactly 10 digits."
return contact_number, suggestion
def extract_hyperlinks(self, pdf_path):
doc = fitz.open(pdf_path)
links = []
for page_num in range(len(doc)):
page = doc.load_page(page_num)
link_list = page.get_links()
for link in link_list:
uri = link.get('uri', None)
if uri:
links.append(uri)
return links
def extract_text_from_pdf(self, pdf_path):
return extract_text(pdf_path)
def extract_email_from_text(self, text):
pattern = r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b"
match = re.search(pattern, text)
if match:
return match.group()
return None
def extract_email_from_resume(self, pdf_path):
text = self.extract_text_from_pdf(pdf_path)
email = self.extract_email_from_text(text)
suggestion = ""
# If no email found in text, check hyperlinks
if not email:
links = self.extract_hyperlinks(pdf_path)
for link in links:
if link.startswith('mailto:'):
email_candidate = link.split('mailto:')[1]
if self.is_valid_email(email_candidate):
email = email_candidate
break
# Additional validation for email found in text or links
if email and not self.is_valid_email(email):
suggestion += "Your email address doesn't seem to be valid. Please check and correct."
return email, suggestion
def is_valid_email(self, email):
# Length check
if len(email) > 254:
return False
# Consecutive special characters check
if re.search(r"[._%+-]{2,}", email):
return False
# Domain part validation
domain_part = email.split('@')[1]
if not re.match(r"[A-Za-z0-9.-]+\.[A-Za-z]{2,}", domain_part):
return False
# Standard email format check
pattern = r"^[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$"
return re.match(pattern, email) is not None
def extract_sections_from_resume(self, text):
missing_sections = []
sections_not_capitalized = []
for section in required_sections:
pattern = r"\b{}\b".format(re.escape(section))
match_obj = re.search(pattern, text, re.IGNORECASE)
if not match_obj:
missing_sections.append(section)
else:
if match_obj.group() not in map(str.upper, required_sections):
sections_not_capitalized.append(section)
return missing_sections, sections_not_capitalized
def extract_skills_from_resume(self, text):
if not isinstance(text, str):
raise ValueError(f"Expected 'text' to be a string, but got {type(text)}")
skills = []
for skill in essential_skills:
pattern = r"\b{}\b".format(re.escape(skill))
match = re.search(pattern, text, re.IGNORECASE)
if match:
skills.append(skill)
return skills
def extract_keyword_variations_from_resume(self, text):
found_keywords = []
for keyword, variations in keyword_variations.items():
for variation in variations:
if variation.lower() in text.lower():
found_keywords.append(variation)
break
return found_keywords
def extract_keyword_variations_from_formatted_text(self, formatted_text):
found_keyword_section = []
for keyword, variations in keyword_variations.items():
for variation in variations:
if variation.lower() in formatted_text.lower():
found_keyword_section.append(variation)
break
return found_keyword_section
def extract_linkedIn_urls_from_pdf(self, pdf_path):
linkedin_urls = None
pdf_document = fitz.open(pdf_path)
for page_num in range(len(pdf_document)):
page = pdf_document.load_page(page_num)
links = page.get_links()
for link in links:
url = link.get('uri', '')
if re.search(linkedin_domain, url):
linkedin_urls = url
pdf_document.close()
return linkedin_urls
def extract_github_urls_from_pdf(self, pdf_path):
github_urls = None
pdf_document = fitz.open(pdf_path)
for page_num in range(len(pdf_document)):
page = pdf_document.load_page(page_num)
links = page.get_links()
for link in links:
url = link.get('uri', '')
if re.search(github_domain, url):
path = re.sub(github_domain, '', url)
parts = path.split('/')
if len(parts) == 1:
github_urls = url
pdf_document.close()
return github_urls
def extract_extra_urls_pdf(self,pdf_path, domains):
extracted_urls = defaultdict(set)
pdf_document = None
try:
# Open the PDF document
pdf_document = fitz.open(pdf_path)
# Iterate through all pages in the PDF
for page_num in range(len(pdf_document)):
page = pdf_document.load_page(page_num)
links = page.get_links()
for link in links:
url = link.get('uri', '')
if url: # Ensure there's a URL
for domain in domains:
if re.search(domain, url, re.IGNORECASE):
extracted_urls[domain].add(url) # Add URL to the domain's set
except Exception as e:
print(f"Error processing PDF: {e}")
finally:
if pdf_document is not None:
pdf_document.close()
return {domain: list(urls) for domain, urls in extracted_urls.items()}
def is_valid_url(self, github_urls):
suggest = ""
if not github_urls:
return suggest
try:
response = requests.head(github_urls)
if response.status_code != 200:
suggest = "GitHub URL is not valid, please check and correct. "
except requests.RequestException:
suggest = "GitHub URL is not valid, please check and correct. "
return suggest
def is_valid_name(self, name):
if any(char.isdigit() for char in name):
return False
if len(name.split()) > 3:
return False
common_non_names = {"Email", "Github", "LinkedIn", "Portfolio", "Data Analyst"}
if name in common_non_names:
return False
return True
def extract_name(self, resume_text):
lines = resume_text.split('\n')
# Use regex to find lines that likely contain names
name_lines = [line for line in lines if re.match(r'^[A-Za-z]*\s[A-Za-z]*', line.strip())]
names = []
for i in range(len(name_lines)):
if self.is_valid_name(name_lines[i].strip()):
names.append(name_lines[i].strip())
if len(names) >= 1:
name = names[0]
suggestion = ""
# Check if the name parts contain only alphabetic characters
name_parts = name.split()
if any(part[0].islower() for part in name_parts):
suggestion += " name should start with a capital letter. "
return name, suggestion
return None, "No valid name found"
def check_missing_sections(self, resume_data):
missing_information = []
for section in basic_informations:
if not resume_data.get(section):
missing_information.append(section)
return missing_information
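# Split the raw resume text into sections: a line matching a known header from
# section_headers starts a new bucket, and every following line is collected
# under that header until the next header appears.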
def segregate_sections(self, text):
header_pattern = re.compile(rf'^\s*({"|".join(re.escape(header) for header in section_headers)}):?\s*$', re.IGNORECASE)
sections_text = {}
current_section = None
lines = text.splitlines()
for line in lines:
clean_line = line.strip()
match = header_pattern.match(clean_line)
if match:
current_section = match.group(1).upper()
sections_text[current_section] = []
elif current_section:
sections_text[current_section].append(line.strip())
return sections_text
def extract_and_format_sections(self, sections_text, Extract_sections):
formatted_text = ""
for section in Extract_sections:
if section in sections_text:
section_content = " ".join(sections_text[section]).replace('\n', ' ')
formatted_text += f"{section}:\n{section_content}\n\n"
return formatted_text
def replace_keywords_with_placeholders(self, formatted_text, found_keyword_section):
placeholder_text = formatted_text
keyword_placeholders = {}
# Use a set to avoid duplicates and keep track of keyword placeholders
used_keywords = set()
for i, keyword in enumerate(found_keyword_section):
if keyword not in used_keywords:
used_keywords.add(keyword)
placeholder = f"{{KEYWORD_{i}}}"
keyword_placeholders[placeholder] = keyword
# Using word boundary to match whole words
placeholder_text = re.sub(r'\b' + re.escape(keyword) + r'\b', placeholder, placeholder_text, flags=re.IGNORECASE)
return placeholder_text, keyword_placeholders
def replace_placeholders_with_keywords(self, grammar_issues, keyword_placeholders):
updated_issues = []
for issue in grammar_issues:
context = issue['context']
for placeholder, keyword in keyword_placeholders.items():
context = context.replace(placeholder, keyword)
# Update the context in the issue dictionary
issue['context'] = context
updated_issues.append(issue)
return updated_issues
def grammar_check(self, placeholder_text):
matches = tool.check(placeholder_text)
grammar_issues = []
for match in matches:
issue = {
"context": match.context,
"error": match.message,
"rule_id": match.ruleId,
"suggested_correction": match.replacements
}
grammar_issues.append(issue)
return grammar_issues
def filter_grammar_issues(self, grammar_issues, ignore_rule_ids=None, ignore_error_keywords=None):
if ignore_rule_ids is None:
ignore_rule_ids = []
if ignore_error_keywords is None:
ignore_error_keywords = []
filtered_issues = []
for issue in grammar_issues:
if issue['rule_id'] not in ignore_rule_ids and not any(keyword in issue['error'] for keyword in ignore_error_keywords):
filtered_issues.append(issue)
return filtered_issues
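# Grammar-check pipeline for the requested sections: segregate the text, format
# the sections, mask known domain keywords with placeholders so LanguageTool
# does not flag them, run the check, restore the keywords, and drop issues whose
# rule IDs or messages are configured to be ignored.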
def process_resume(self, text, found_keyword_section, Extract_sections):
sections_text = self.segregate_sections(text)
formatted_text = self.extract_and_format_sections(sections_text, Extract_sections)
found_keyword_section = self.extract_keyword_variations_from_formatted_text(formatted_text)
placeholder_text, keyword_placeholders = self.replace_keywords_with_placeholders(formatted_text, found_keyword_section)
grammar_issues = self.grammar_check(placeholder_text)
grammar_issues_text = self.replace_placeholders_with_keywords(grammar_issues, keyword_placeholders)
filtered_grammar_issues = self.filter_grammar_issues(grammar_issues, ignore_rule_ids, ignore_error_keywords)
return filtered_grammar_issues
def grammar_issue_check(self, text, found_keyword_section, Extract_sections):
issues = {}
text1 = " ".join(text.split("\n"))
for section in Extract_sections:
grammar_issues = self.process_resume(text, found_keyword_section, [section])
if not grammar_issues:
grammar_issues = "no error found"
issues[section] = grammar_issues
return issues
def normalize_font_name(self,font_name):
if '-' in font_name:
font_name = font_name.split('-')[0]
if '+' in font_name:
font_name = font_name.split('+')[1]
return font_name
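# Walk every character in the PDF layout (via pdfminer) and group consecutive
# characters sharing the same font size, font name, and page into phrases.
# Phrases contained in predefined_terms (name, email, section headers) are
# skipped so they do not trigger formatting warnings.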
def extract_text_properties(self, pdf_path, predefined_terms):
text_properties = []
current_phrase = ""
current_font_size = None
current_font_name = None
current_page_num = None
special_characters = set("●▪•!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~")  # bullet glyphs and punctuation treated as phrase separators
def add_current_phrase():
nonlocal current_phrase
if current_phrase.strip():
flag = any(current_phrase in term for term in predefined_terms)
if not flag:
text_properties.append({
"text": current_phrase,
"font_size": current_font_size,
"font_name": current_font_name,
"page_num": current_page_num
})
current_phrase = ""
for page_layout in extract_pages(pdf_path):
for element in page_layout:
if isinstance(element, LTTextContainer):
for text_line in element:
if isinstance(text_line, LTTextLineHorizontal):
for character in text_line:
if isinstance(character, LTChar):
text = character.get_text()
font_size = round(character.size, 2)
font_name = self.normalize_font_name(character.fontname)
page_num = page_layout.pageid
if text.isspace() or text in special_characters:
add_current_phrase()
continue
if (font_size != current_font_size or font_name != current_font_name or
page_num != current_page_num):
add_current_phrase()
current_font_size = font_size
current_font_name = font_name
current_page_num = page_num
current_phrase += text
add_current_phrase()
return text_properties
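# Bucket the extracted phrases by (font name, font size rounded to the nearest
# 'tolerance' points); the largest bucket is treated as the resume's base style
# and everything else is reported as a formatting deviation.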
def group_similar_fonts(self,text_properties, tolerance=0.5):
grouped_properties = defaultdict(list)
for prop in text_properties:
rounded_size = round(prop["font_size"] / tolerance) * tolerance
key = (prop["font_name"], rounded_size)
grouped_properties[key].append(prop)
return grouped_properties
def identify_different_fonts_and_sizes(self, grouped_properties):
most_common_group = max(grouped_properties.values(), key=len)
most_common_key = None
for key, group in grouped_properties.items():
if group == most_common_group:
most_common_key = key
break
different_texts = []
for key, group in grouped_properties.items():
if group != most_common_group:
for prop in group:
reason = []
if key[1] != most_common_key[1]:
reason.append(f"size not {most_common_key[1]}")
if key[0] != most_common_key[0]:
reason.append(f"font not {most_common_key[0]}")
different_texts.append({
"page_num": prop['page_num'],
"text": prop['text'],
"found_size": prop['font_size'],
"found_font_name": prop['font_name'],
"reason": ", ".join(reason)
})
return different_texts
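# Collect date strings from a section. Only entries containing at least two
# date tokens (i.e. a start/end range) are kept, so standalone years in running
# text are ignored.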
def parse_dates(self, sections_text, section_name):
# Check if the section is in the text
suggest = ""
# Define the date patterns to match various date formats
date_pattern = (
r'\b\d{1,2}/\d{4}\b|' # MM/YYYY
r'\b(?:jan(?:uary)?|feb(?:ruary)?|mar(?:ch)?|apr(?:il)?|may|jun(?:e)?|jul(?:y)?|aug(?:ust)?|sep(?:tember)?|oct(?:ober)?|nov(?:ember)?|dec(?:ember)?)\s+\d{4}\b|' # Month YYYY
r'\b(?:jan(?:uary)?|feb(?:ruary)?|mar(?:ch)?|apr(?:il)?|may|jun(?:e)?|jul(?:y)?|aug(?:ust)?|sep(?:tember)?|oct(?:ober)?|nov(?:ember)?|dec(?:ember)?)\s+\d{1,2},?\s*\d{4}\b|' # Month DD, YYYY
r'\b\d{4}\b|' # YYYY
r'\b(?:jan(?:uary)?|feb(?:ruary)?|mar(?:ch)?|apr(?:il)?|may|jun(?:e)?|jul(?:y)?|aug(?:ust)?|sep(?:tember)?|oct(?:ober)?|nov(?:ember)?|dec(?:ember)?)[a-z]*/?\d{4}\b|' # Month/YYYY
r'\b(?:jan(?:uary)?|feb(?:ruary)?|mar(?:ch)?|apr(?:il)?|may|jun(?:e)?|jul(?:y)?|aug(?:ust)?|sep(?:tember)?|oct(?:ober)?|nov(?:ember)?|dec(?:ember)?)[a-z]*\d{4}\s*-\s*(?:jan(?:uary)?|feb(?:ruary)?|mar(?:ch)?|apr(?:il)?|may|jun(?:e)?|jul(?:y)?|aug(?:ust)?|sep(?:tember)?|oct(?:ober)?|nov(?:ember)?|dec(?:ember)?)[a-z]*\d{4}\b' # Month/YYYY - Month/YYYY
)
all_dates = []
# Iterate over the entries in the section_name
for entry in sections_text[section_name]:
entry = entry.lower()
matches = re.findall(date_pattern, entry)
if matches and len(matches)>1:
if len(matches) == 2:
all_dates.append(f"{matches[0]} {matches[1]}")
else:
all_dates.extend(matches)
return all_dates
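# Convert a raw date string into (start_date, end_date) datetime.date objects,
# e.g. "jan 2020 mar 2022" -> (date(2020, 1, 1), date(2022, 3, 1)); a single
# date yields identical start and end values.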
def convert_to_date(self, date_str):
# Mapping of month names and abbreviations to their numeric equivalents
month_map = {
'jan': 1, 'january': 1, 'feb': 2, 'february': 2,
'mar': 3, 'march': 3, 'apr': 4, 'april': 4,
'may': 5, 'jun': 6, 'june': 6, 'jul': 7,
'july': 7, 'aug': 8, 'august': 8, 'sep': 9,
'september': 9, 'oct': 10, 'october': 10,
'nov': 11, 'november': 11, 'dec': 12, 'december': 12,
'01': 1, '02': 2, '03': 3, '04': 4,
'05': 5, '06': 6, '07': 7, '08': 8,
'09': 9, '10': 10, '11': 11, '12': 12
}
# Regex patterns to match different date formats
pattern_mm_yyyy = re.compile(r'(\d{1,2})/(\d{4})')
pattern_mm_yyyy_space = re.compile(r'(\d{1,2})\s(\d{4})')
pattern_month_yyyy = re.compile(r'([a-zA-Z]+)\s?(\d{4})')
pattern_yyyy = re.compile(r'(\d{4})')
def extract_date(date_str):
match_mm_yyyy = pattern_mm_yyyy.match(date_str)
match_mm_yyyy_space = pattern_mm_yyyy_space.match(date_str)
match_month_yyyy = pattern_month_yyyy.match(date_str)
match_yyyy = pattern_yyyy.match(date_str)
if match_mm_yyyy:
month = int(match_mm_yyyy.group(1))
year = int(match_mm_yyyy.group(2))
elif match_mm_yyyy_space:
month = int(match_mm_yyyy_space.group(1))
year = int(match_mm_yyyy_space.group(2))
elif match_month_yyyy:
month = month_map.get(match_month_yyyy.group(1).lower())
year = int(match_month_yyyy.group(2))
elif match_yyyy:
month = 1
year = int(match_yyyy.group(1))
else:
return []
return datetime.date(year, month, 1)
date_parts = re.findall(r'(\d{4}\s[a-zA-Z]+\s?|\d{4}[a-zA-Z]+|\d{4}\/\d{2}|\d{4}\s\d{2}|[a-zA-Z]+\s?\d{4}|\d{4}\s[a-zA-Z]+)', date_str)
if len(date_parts) == 1:
# Standalone year or single date
start_date = extract_date(date_parts[0])
if not start_date:
return []
end_date = start_date
elif len(date_parts) == 2:
# Date range
start_date = extract_date(date_parts[0])
end_date = extract_date(date_parts[1])
if not start_date or not end_date:
return []
else:
return []
return start_date, end_date
def date_time(self, date_parts):
converted_dates = []
for date_part in date_parts:
result = self.convert_to_date(date_part)
if result:
start_date, end_date = result
converted_dates.append((start_date, end_date))
return converted_dates
def check_chronological_order(self, converted_dates, section_name ):
suggestion = ""
sorted_dates = sorted(converted_dates, key=lambda x: (x[1], x[0]), reverse=True)
if converted_dates == sorted_dates:
suggestion = f"{section_name} section is in chronological order."
else:
suggestion = f"{section_name} section is not in chronological order."
return suggestion
def check_common_projects(self, projects_text):
found_projects = []
for project in common_projects:
if project.lower() in projects_text.lower():
found_projects.append(project)
return found_projects
def recommend_resources(self):
# Randomly pick 2 blog articles and 2 YouTube links
recommended_blogs = random.sample(blog_articles, 2)
recommended_youtube = random.sample(youtube_links, 2)
# Return the recommendations
return {
"Recommended Blogs": recommended_blogs,
"Recommended YouTube Links": recommended_youtube
}
def check_imarticus_certifications(self, certifications_text):
# Check if "imarticus" is present in the certifications text
if "imarticus" in certifications_text.lower():
return {
"found": True,
"message": "Imarticus certification found. Please upload it in the academic section."
}
return {
"found": False,
"message": "No Imarticus certification found in the provided text."
}
def chronological_order_check(self, sections_text, section_name):
order_suggestion = ""
suggestion = ""
section_name = section_name.upper()
if section_name in sections_text:
date = self.parse_dates(sections_text, section_name)
if date:
converted_dates = self.date_time(date)
order_suggestion = self.check_chronological_order(converted_dates, section_name)
else:
suggestion = f"No valid dates found in {section_name} section. "
else:
suggestion = f"{section_name} is not in section header. "
return order_suggestion, suggestion
# Check section headers against the known header list and collect possible misspellings
def check_spelling(self, headers, section_headers):
suggestions = []
for header in headers:
if header.upper() not in map(str.upper, section_headers):
suggestions.append(header)
return suggestions
def is_present_name(self, name):
"""Return True if the given name has at least two words."""
parts = name.split()
return len(parts) >= 2
def is_sentence_case(self, name):
"""Return True if every word starts with an uppercase letter followed by lowercase letters."""
parts = name.split()
for part in parts:
if not part:  # handles empty strings in name
continue
if not part[0].isupper() or not part[1:].islower():
return False
return True
def extract_project_links(self,sections_text):
project_links = {}
if "PROJECTS" in sections_text:
project_list = sections_text.get("PROJECTS", [])
url_pattern = r"https?://[^\s]+"
for project in project_list:
links = re.findall(url_pattern,project)
if links:
project_links[project] = links
return project_links
def count_sentences(self,text):
sentence_endings = r"(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|!)\s"
sentences = re.split(sentence_endings, text)
sentences = [s.strip() for s in sentences if s.strip()]
return len(sentences)
def calculate_summary_score(self, summary):
if not summary:
return 0
if isinstance(summary, list):
summary = " ".join(summary)  # sections are stored as lists of lines
num_sentences = self.count_sentences(summary)
if num_sentences <= 4:
return 3
return 1
def calculate_extra_urls_bonus(self,pdf_path):
domains = [
r"hackerrank\.com", # Hackerrank
r"leetcode\.com", # LeetCode
r"medium\.com" # Medium
]
extra_urls = self.extract_extra_urls_pdf(pdf_path, domains)
has_extra_urls = any(urls for urls in extra_urls.values())
return 5 if has_extra_urls else 0
def calculate_relevant_experience_score(self, experience_text):
"""
Assigns a score based on the presence of relevant experience keywords.
Args:
experience_text (str): The extracted work experience section text.
Returns:
int: A score of 5 if relevant keywords are found, otherwise 0.
"""
if not experience_text:
return 0  # No experience section, score 0
if isinstance(experience_text, list):
experience_text = " ".join(experience_text)  # Convert list to a single string
experience_text = experience_text.strip().lower()  # Ensure it's a string and lowercase
# Check if any keyword from 'data_science_skills' or 'essential_skills' exists
for skill in data_science_skills + essential_skills:
if skill.lower() in experience_text:
return 5  # Found relevant experience, full score
return 0
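# Skill-match rubric: no skills extracted -> 0, skills present but none from
# data_science_skills -> 2, 1-5 matches -> 3, more than 5 matches -> 5.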
def calculate_ds_skills_score(self, skills_present):
if not skills_present: # No skills found at all
return 0
# Use skills from config instead of hardcoded list
ds_skills_list_lower = [skill.lower() for skill in data_science_skills]
skills_present_lower = [skill.lower() for skill in skills_present]
matching_count = sum(1 for skill in skills_present_lower
if skill in ds_skills_list_lower)
if matching_count == 0: # Skills found but none match DS list
return 2
elif 1 <= matching_count <= 5:
return 3
elif matching_count > 5:
return 5
return 0
def calculate_project_link_score(self, projects_with_links):
"""
Assigns a score based on whether project links are present.
Args:
projects_with_links (int): The number of projects with links.
Returns:
int: 2 if project links are found, otherwise 0.
"""
return 2 if projects_with_links > 0 else 0
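# Aggregate resume score: up to 3 points each for name, contact number, email,
# LinkedIn and GitHub URLs, up to 10 for section completeness, 5 when no
# copy-paste "common" projects are found, a 2-point penalty when a section-order
# suggestion is raised, up to 5 for project count, up to 7 for certifications,
# plus the data-science skill, relevant-experience and project-link scores
# computed by the helpers above.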
def imarticus_review_score(self,name,contact_number,email,linkedin_urls,github_url,missing_sections,sections_not_capitalized,common_projects,section_order_suggestion,sections_text,skills,relevant_experience_score):
score = 0
if name:
name_parts = name.split()
num_parts = len(name_parts)
if num_parts == 0:
score += 0
if self.is_sentence_case(name):
score += 3
elif self.is_present_name(name):
score += 1.5
if contact_number and isinstance(contact_number, str):
digits_only = re.sub(r'\D', '', contact_number)
if digits_only.startswith("91") and len(digits_only) > 10:
digits_only = digits_only[2:] # Remove the first two characters ('91')
if len(digits_only) == 10 and digits_only[0] in "6789": # Check for valid Indian mobile numbers
score += 3
if email:
score += 3 if self.is_valid_email(email) else 0
score += 3 if linkedin_urls else 0
if github_url:
github_suggestion = self.is_valid_url(github_url)
score += 3 if not github_suggestion else 0
else:
score += 0
if len(missing_sections)==0 and len(sections_not_capitalized)==0:
score+=10
elif len(missing_sections)==0 and len(sections_not_capitalized)>0:
score+=8
elif len(missing_sections)<=3:
score+=6
else:
score+=3
if common_projects:
score +=0
else:
score +=5
if section_order_suggestion:
score -= 2
"""
ds_skills_list_lower = [skill.lower() for skill in data_science_skills]
skills_present_lower = [skill.lower() for skill in self.extract_skills_from_resume(skills) ]
matching_skill_count = 0
for skill in skills_present_lower:
if ds_skills_list_lower:
matching_skill_count+=1
if matching_skill_count==0:
score+=0
if matching_skill_count<=5:
score+=2
elif matching_skill_count>=10 and matching_skill_count<=15:
score+5
else:
score+=8
"""
if "PROJECTS" not in sections_text:
score+=0
else:
project_list = sections_text.get("PROJECTS",[])
project_count = len([x for x in project_list if "Description" in x])
if project_count<=2:
score+=2
elif project_count>2 and project_count<=4:
score+=5
elif project_count>4:
score+=3
"""
project_links = self.extract_project_links(sections_text)
total_projects = len(sections_text.get("PROJECTS", []))
projects_with_links = len(project_links)
if total_projects > 0:
if projects_with_links == 0:
score+=0
elif projects_with_links / total_projects >= 0.5:
score += 1.5
if projects_with_links == total_projects:
score += 3
"""
resume_data = {}
# Extract projects & links
project_links = self.extract_project_links(sections_text)
projects_with_links = len(project_links)
# Count only projects with descriptions
valid_projects = [
p for p in sections_text.get("PROJECTS", []) if "description" in p.lower()
]
total_projects = len(valid_projects)  # Count projects properly
# Calculate project link score
project_link_score = self.calculate_project_link_score(projects_with_links)
resume_data["project_link_score"] = project_link_score
# Prevent division by zero when no projects exist
if total_projects > 0:
if projects_with_links == 0:
score += 0
elif projects_with_links / total_projects >= 0.5:
score += 1.5
if projects_with_links == total_projects:
score += 3
else:
score += 0
"""
profile_summary = sections_text.get("PROFILE SUMMARY", "")
print(profile_summary)
summary_score = self.calculate_summary_score(profile_summary)
score += summary_score
"""
ds_skills_score = self.calculate_ds_skills_score(skills)
score += ds_skills_score
certifications = sections_text.get("CERTIFICATIONS & ACADEMIC ENDEAVOURS", [])
num_certifications = len(certifications)
if num_certifications==0:
score+=0
elif 0 < num_certifications <= 2:
score+=3
elif 2 < num_certifications <= 4:
score+=5
elif num_certifications>4:
score+=7
"""
extra_urls_bonus = self.calculate_extra_urls_bonus(pdf_path)
score += extra_urls_bonus
"""
score += relevant_experience_score
score += project_link_score
return score
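# Same rubric as imarticus_review_score, but returned as a per-criterion
# breakdown dictionary instead of a single total.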
def imarticus_detailed_score(self, name, contact_number, email, linkedin_urls, github_url,
missing_sections=None, sections_not_capitalized=None, common_projects=None,
section_order_suggestion=None, sections_text=None, skills=None,
relevant_experience_score=0):
# Ensure lists and dictionaries have default values to avoid 'NoneType' errors
missing_sections = missing_sections or []
sections_not_capitalized = sections_not_capitalized or []
common_projects = common_projects or []
sections_text = sections_text or {}
score_breakdown = {
"name_score": 0,
"contact_number_score": 0,
"email_score": 0,
"linkedin_url_score": 0,
"github_url_score": 0,
"missing_sections_score": 0,
"common_projects_score": 0,
"section_order_score": 0,
"projects_score": 0,
"certifications_score": 0,
"relevant_experience_score": 0,
"ds_skills_score": 0,
"extra_urls_bonus": 0,
"summary_score": 0,
"project_link_score": 0
}
# Name Score (3 Points)
if name:
if self.is_sentence_case(name):
score_breakdown["name_score"] = 3
elif self.is_present_name(name):
score_breakdown["name_score"] = 1.5
# Contact Number Score (3 Points)
if contact_number and isinstance(contact_number, str):
digits_only = re.sub(r'\D', '', contact_number)
if digits_only.startswith("91") and len(digits_only) > 10:
digits_only = digits_only[2:]
if len(digits_only) == 10 and digits_only[0] in "6789":
score_breakdown["contact_number_score"] = 3
# Email Score (3 Points)
score_breakdown["email_score"] = 3 if email and self.is_valid_email(email) else 0
# LinkedIn URL Score (3 Points)
score_breakdown["linkedin_url_score"] = 3 if linkedin_urls else 0
# GitHub URL Score (3 Points): is_valid_url() returns a non-empty suggestion only when the URL is invalid
if github_url and not self.is_valid_url(github_url):
score_breakdown["github_url_score"] = 3
# Missing Sections Score (10 Points)
if not missing_sections and not sections_not_capitalized:
score_breakdown["missing_sections_score"] = 10
elif not missing_sections and sections_not_capitalized:
score_breakdown["missing_sections_score"] = 8
elif len(missing_sections) <= 3:
score_breakdown["missing_sections_score"] = 6
else:
score_breakdown["missing_sections_score"] = 3
# Common Projects Score (5 Points)
score_breakdown["common_projects_score"] = 0 if common_projects else 5
# Section Order Score (2 Points)
score_breakdown["section_order_score"] = -2 if section_order_suggestion else 0
# Projects Score (5 Points)
if "PROJECTS" in sections_text:
project_list = sections_text.get("PROJECTS", [])
project_count = len([x for x in project_list if "Description" in x])
if project_count <= 2:
score_breakdown["projects_score"] = 2
elif 2 < project_count <= 4:
score_breakdown["projects_score"] = 5
else:
score_breakdown["projects_score"] = 3
# Certifications Score (7 Points)
certifications = sections_text.get("CERTIFICATIONS & ACADEMIC ENDEAVOURS", [])
num_certifications = len(certifications)
if num_certifications == 0:
score_breakdown["certifications_score"] = 0
elif 0 < num_certifications <= 2:
score_breakdown["certifications_score"] = 3
elif 2 < num_certifications <= 4:
score_breakdown["certifications_score"] = 5
else:
score_breakdown["certifications_score"] = 7
# Relevant Experience Score (5 Points)
score_breakdown["relevant_experience_score"] = relevant_experience_score if relevant_experience_score is not None else 0
# Data Science Skills Score (5 Points)
score_breakdown["ds_skills_score"] = self.calculate_ds_skills_score(skills)
# Extra URLs Bonus (5 Points)
# NOTE: calculate_extra_urls_bonus() expects a PDF path; passing sections_text here yields no bonus
score_breakdown["extra_urls_bonus"] = self.calculate_extra_urls_bonus(sections_text)
# Summary Score (5 Points)
profile_summary = sections_text.get("PROFILE SUMMARY", "")
score_breakdown["summary_score"] = self.calculate_summary_score(profile_summary)
# Project Link Score (2 Points)
project_links = self.extract_project_links(sections_text)
projects_with_links = len(project_links)
score_breakdown["project_link_score"] = self.calculate_project_link_score(projects_with_links)
return score_breakdown
def calculate_name_score(self,name):
if not name:
return 0
name_parts = name.split()
num_parts = len(name_parts)
if num_parts == 0:
return 0
elif self.is_sentence_case(name):
return 3
elif self.is_present_name(name):
return 1.5
else:
return 0
def calculate_contact(self, contact_number):
if contact_number and isinstance(contact_number, str):
digits_only = re.sub(r'\D', '', contact_number)
if digits_only.startswith("91") and len(digits_only) > 10:
digits_only = digits_only[2:]  # Remove the leading country code ('91')
if len(digits_only) == 10 and digits_only[0] in "6789":  # Check for valid Indian mobile numbers
return 3
return 0
def calculate_email(self, email):
if email and self.is_valid_email(email):
return 3
return 0
def calculate_github_url_score(self,github_url):
if github_url:
github_suggestion = self.is_valid_url(github_url)
return 3 if not github_suggestion else 0
return 0
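# Main entry point: runs every extraction and scoring step on the PDF at 'path'
# and returns the aggregated results as a Flask JSON response.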
def parse_text(self, path):
logger = logging.getLogger(__name__)
logging.getLogger("pdfminer").setLevel(logging.WARNING)
resume_data = {}
logger.debug('parsing text')
text = self.extract_text_from_pdf(path)
text1 = " ".join(text.split("\n"))
skills_found = self.extract_skills_from_resume(text)
found_keywords = self.extract_keyword_variations_from_resume(text)
sections_text = self.segregate_sections(text)
formatted_text = self.extract_and_format_sections(sections_text, Extract_sections)
found_keyword_section = self.extract_keyword_variations_from_formatted_text(formatted_text)
parsed_sections = self.segregate_sections(text)
projects = parsed_sections.get("PROJECTS", [])
certifications = parsed_sections.get("CERTIFICATIONS & ACADEMIC ENDEAVOURS", [])
projects_text = "\n".join(projects)
certifications_text = "\n".join(certifications)
found_imarticus_certification = self.check_imarticus_certifications(certifications_text)
found_projects = self.check_common_projects(projects_text)
name, name_suggestion = self.extract_name(text)
contact_number, contact_suggestion = self.extract_contact_number_from_resume(text)
email, email_suggestion = self.extract_email_from_resume(path)
github_urls = self.extract_github_urls_from_pdf(path)
github_urls_suggestions = self.is_valid_url(github_urls)
linkedin_urls = self.extract_linkedIn_urls_from_pdf(path)
section_by_grammer_issues = self.grammar_issue_check(text, found_keyword_section, Extract_sections)
domains = [
r"hackerrank\.com", # Hackerrank
r"leetcode\.com", # LeetCode
r"medium\.com" # Medium
]
extra_urls = self.extract_extra_urls_pdf(path, domains)
education_order_suggestion, education_suggestion = self.chronological_order_check(sections_text, "ACADEMIC PROFILE")
experience_order_suggestion, experience_suggestion = self.chronological_order_check(sections_text, "WORK EXPERIENCE")
headers = list(sections_text.keys())
spelling_suggestions = self.check_spelling(headers, section_headers)
predefined_terms = [term for term in (name, email) if term]  # skip missing values to avoid matching against None
predefined_terms.extend(required_sections)
text_properties = self.extract_text_properties(path, predefined_terms)
grouped_properties = self.group_similar_fonts(text_properties)
different_texts = self.identify_different_fonts_and_sizes(grouped_properties)
font_suggestions = []
for item in different_texts:
font_suggestion = f"Formatting issue at Page: {item['page_num']}, Text: {item['text']}, Reason: {item['reason']}, Found font size: {item['found_size']}, Found font name: {item['found_font_name']}"
font_suggestions.append(font_suggestion)
missing_sections, sections_not_capitalized = self.extract_sections_from_resume(text)
linkedin_urls_suggestion = str()
common_project = str()
if not name:
name_suggestion = "Please add name to the resume."
if not contact_number:
contact_suggestion = "Please add the contact number to the resume."
if not email:
email_suggestion = "Please add the email address to the resume."
if not github_urls:
github_urls_suggestions = "Add the github_urls to the resume."
if not linkedin_urls:
linkedin_urls_suggestion = "Add the linkedin_urls to the resume."
if found_projects:
common_project = "Common projects found in Projects section: "
for project in found_projects:
common_project += project
# Replace the existing project length suggestion code with:
project_list = sections_text.get("PROJECTS", [])
projects_with_description = [
p for p in project_list
if "description" in p.lower()
]
project_count = len(projects_with_description)
if project_count == 0:
project_length_suggestion = "No projects found. Consider at least 2 projects."
elif project_count == 1:
project_length_suggestion = "Only 1 project found. Consider adding 1 more project."
else:
project_length_suggestion = f"{project_count} projects found."
# Store in resume data (keeps your existing URL extraction)
resume_data["project_length_suggestion"] = project_length_suggestion
experience_text = sections_text.get("WORK EXPERIENCE", "")  # Extract the work experience section
relevant_experience_score = self.calculate_relevant_experience_score(experience_text)  # Calculate score
# Store in the final resume data output
resume_data["relevant_experience_score"] = relevant_experience_score
section_grammar_check_issues = self.grammar_check(" ".join(sections_text.keys()))  # LanguageTool expects a single string, not dict_keys
recommended_blogs = random.sample(blog_articles, 2)
recommended_youtube = random.sample(youtube_links, 2)
name_score = self.calculate_name_score(name)
contact_score = self.calculate_contact(contact_number)
email_score = self.calculate_email(email)
github_url_score = self.calculate_github_url_score(github_urls)
# Calculate imarticus_score
imarticus_score = self.imarticus_review_score(
name,
contact_number,
email,
linkedin_urls,
github_urls,
missing_sections,
sections_not_capitalized,
common_projects=found_projects, # Ensure to pass found projects
section_order_suggestion=experience_order_suggestion,
sections_text=sections_text,
skills=skills_found,
relevant_experience_score=relevant_experience_score,
#pdf_path=path
#relevant_keywords_found=bool(found_keywords), # Convert to boolean
#experience_orderly_arranged=experience_order_suggestion, # Pass orderly arrangement check
#experience_section_present="WORK EXPERIENCE" in sections_text # Check if experience section is present
)
# Populate resume data dictionary
resume_data = {
"name": name,
"contact_number": contact_number,
"email": email,
"linkedin_urls": linkedin_urls,
"experience_order_suggestion": experience_order_suggestion,
"education_order_suggestion": education_order_suggestion,
"grammer_issues_by_section": section_by_grammer_issues,
"github_urls": github_urls,
"skills": skills_found,
"spelling_suggestions": spelling_suggestions,
"found_keywords": found_keywords,
"text": text,
"font_suggestions": font_suggestions,
"name_suggestion": name_suggestion,
"contact_suggestion": contact_suggestion,
"email_suggestion": email_suggestion,
"github_urls_suggestions": github_urls_suggestions,
"linkedin_urls_suggestion": "Add the LinkedIn URLs to the resume." if not linkedin_urls else "",
"missing_sections": missing_sections,
"common_projects": "Common projects found in Projects section: " + ", ".join(found_projects) if found_projects else "",
"project_length_suggestion": project_length_suggestion,
"section_grammar_check_issues": section_grammar_check_issues,
"imarticus_score": imarticus_score, # Add the score to resume data
"extra_urls": extra_urls,
"certifications": {
"found": found_imarticus_certification["found"],
"message": found_imarticus_certification["message"],
"text": certifications_text # Store extracted certification text
},
"recommended_blogs": recommended_blogs,
"recommended_youtube_links": recommended_youtube,
"name_score":name_score,
"contact_score":contact_score,
"email_score":email_score,
"github_urls_score":github_url_score
}
# Additional checks and data additions
if "WORK EXPERIENCE" in sections_text.keys() and "WORK EXPERIENCE" != list(sections_text.keys())[2]:
section_order_suggestion = f"WORK EXPERIENCE should come before {list(sections_text.keys())[2]}"
resume_data["section_order_suggestion"] = section_order_suggestion
missing_important_sections = self.check_missing_sections(resume_data)
resume_data["basic_information_section"] = missing_important_sections or "Basic information is Found"
missing_skills = list(set(essential_skills) - set(skills_found))
resume_data["missing_skills"] = missing_skills
found_keywords_count = len(resume_data["found_keywords"])
num_keywords = len(keyword_variations)
quality_mapping = {"Low": 0.2, "Medium": 0.5, "High": 0.8} # Assuming some quality mapping
for quality, threshold in quality_mapping.items():
if found_keywords_count < num_keywords * threshold:
resume_data["quality"] = quality
break
found_certification = "Imarticus certification found in Certifications section." if found_imarticus_certification else "No Imarticus certification found in Certifications section."
resume_data["found_certification"] = found_certification
# Experience relevance check
Extract_exp_sections = ['WORK EXPERIENCE']
experience_text = self.extract_and_format_sections(sections_text, Extract_exp_sections)
if experience_text:
resume_data["work_experience_check"] = "Experience is relevant to Data science." if any(variation.lower() in experience_text.lower() for keyword, variations in keyword_variations.items() for variation in variations) else "Experience is not relevant to Data science."
return jsonify(resume_data)
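# Minimal usage sketch (assumes a Flask application/request context, since
# parse_text() returns a jsonify() response; "sample_resume.pdf" is a
# hypothetical path):
#
#     from flask import Flask
#     app = Flask(__name__)
#     parser = ResumeParser()
#     with app.test_request_context():
#         response = parser.parse_text("sample_resume.pdf")
#         print(response.get_json()["imarticus_score"])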