Spaces:

Imarticuslearning
/

CV_Guru

Sleeping

App Files Files Community

CV_Guru / src /resume_parser.py

Imarticuslearning

Update src/resume_parser.py

bb24ed3 verified 10 months ago

raw

history blame contribute delete

49.6 kB

	#total score = 67
	# Python module that parses the different sections of a resume
	from pdfminer.high_level import extract_pages, extract_text
	from pdfminer.layout import LTTextContainer, LTChar, LTTextLineHorizontal
	from collections import defaultdict
	from flask import jsonify
	import re, fitz, requests, logging, datetime
	import src.config as config
	from .config import data_science_skills, keyword_variations, essential_skills, quality_mapping, Extract_sections, suggested_projects, ignore_rule_ids
	from .config import required_sections, linkedin_domain, github_domain, basic_informations, section_headers, common_projects, ignore_error_keywords,blog_articles,youtube_links
	from .config import kaggle_domain,hackerrank_domain,leetcode_domain,medium_domain
	from spacy.matcher import Matcher
	import language_tool_python
	from collections import defaultdict
	import random
	# Module-level LanguageTool client for US English, created once at import
	# time; ResumeParser.grammar_check calls tool.check() on it.
	tool = language_tool_python.LanguageTool('en-US')



	class ResumeParser:

	def extract_contact_number_from_resume(self, text):
	    """Find the first phone-like number in ``text`` and check its length.

	    Args:
	        text: Plain text extracted from the resume.

	    Returns:
	        tuple: ``(contact_number, suggestion)``. ``contact_number`` is the
	        matched substring, or ``None`` when nothing phone-like is found.
	        ``suggestion`` is empty when the number has 10 digits (optionally
	        preceded by the ``91`` country code); otherwise it asks the user
	        to correct the number.
	    """
	    # Optional country code, optional parentheses around the first three
	    # digits, then a 3-3-4 grouping with optional '-', '.' or whitespace
	    # separators. The previous pattern contained "$?" where the optional
	    # parentheses belong, which is not a valid quantifier target.
	    # Digit look-arounds are used instead of word boundaries so that a
	    # leading '+' or '(' stays part of the match.
	    pattern = (
	        r"(?<!\d)(?:\+?\d{1,3}[-.\s]?)?"
	        r"\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}(?!\d)"
	    )
	    match = re.search(pattern, text)
	    if not match:
	        return None, ""

	    contact_number = match.group()
	    digits_only = re.sub(r"\D", "", contact_number)
	    has_ten_digits = len(digits_only) == 10
	    has_india_prefix = len(digits_only) == 12 and digits_only.startswith("91")
	    if has_ten_digits or has_india_prefix:
	        return contact_number, ""
	    return contact_number, "Contact number should have exactly 10 digits."



	def extract_hyperlinks(self, pdf_path):
	    """Collect the target URI of every link annotation in a PDF.

	    Args:
	        pdf_path: Path of the PDF file to open with PyMuPDF (``fitz``).

	    Returns:
	        list: URIs in page order; links without a ``uri`` entry are skipped.
	    """
	    links = []
	    # The context manager closes the document on every exit path; the
	    # previous version never closed it.
	    with fitz.open(pdf_path) as doc:
	        for page_num in range(len(doc)):
	            page = doc.load_page(page_num)
	            links.extend(
	                link["uri"] for link in page.get_links() if link.get("uri")
	            )
	    return links

	def extract_text_from_pdf(self, pdf_path):
	    """Return whatever pdfminer's ``extract_text`` yields for ``pdf_path``."""
	    pdf_text = extract_text(pdf_path)
	    return pdf_text

	def extract_email_from_text(self, text):
	    """Return the first e-mail-looking substring of ``text``, or ``None``."""
	    email_regex = r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b"
	    found = re.search(email_regex, text)
	    return found.group() if found else None

	def extract_email_from_resume(self, pdf_path):
	    """Locate the candidate's e-mail address in a PDF resume.

	    The extracted text is searched first; only when that yields nothing
	    are the PDF's ``mailto:`` hyperlinks consulted.

	    Returns:
	        tuple: ``(email, suggestion)``. ``email`` may be ``None``;
	        ``suggestion`` is non-empty only when the address that was found
	        fails ``is_valid_email``.
	    """
	    email = self.extract_email_from_text(self.extract_text_from_pdf(pdf_path))

	    if not email:
	        # Fall back to the first mailto: link that carries a valid address.
	        for uri in self.extract_hyperlinks(pdf_path):
	            if not uri.startswith('mailto:'):
	                continue
	            candidate = uri.split('mailto:')[1]
	            if self.is_valid_email(candidate):
	                email = candidate
	                break

	    suggestion = ""
	    if email and not self.is_valid_email(email):
	        suggestion += "Your email address doesn't seem to be valid. Please check and correct."
	    return email, suggestion


	def is_valid_email(self, email):
	    """Check whether ``email`` looks like a syntactically valid address.

	    Args:
	        email: Candidate address; non-string values are rejected.

	    Returns:
	        bool: ``True`` only when the address is at most 254 characters
	        long, contains no run of two or more of ``. _ % + -`` and has the
	        ``local@domain.tld`` shape.
	    """
	    # Reject None / non-strings up front. The old code indexed
	    # email.split('@')[1] and raised IndexError on input without an '@'.
	    if not isinstance(email, str) or len(email) > 254:
	        return False

	    # Consecutive special characters are not accepted anywhere.
	    if re.search(r"[._%+-]{2,}", email):
	        return False

	    # A full match covers the local part and the domain in one go, which
	    # makes a separate domain-only check unnecessary.
	    pattern = r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}"
	    return re.fullmatch(pattern, email) is not None


	def extract_sections_from_resume(self, text):
	    """Report required sections that are absent or not written in capitals.

	    Each entry of ``required_sections`` is searched case-insensitively as
	    a whole word; only the first occurrence is inspected.

	    Returns:
	        tuple: ``(missing_sections, sections_not_capitalized)``, two lists
	        of entries from ``required_sections``.
	    """
	    absent, not_capitalized = [], []
	    upper_names = list(map(str.upper, required_sections))

	    for wanted in required_sections:
	        hit = re.search(r"\b{}\b".format(re.escape(wanted)), text, re.IGNORECASE)
	        if hit is None:
	            absent.append(wanted)
	        elif hit.group() not in upper_names:
	            not_capitalized.append(wanted)

	    return absent, not_capitalized

	def extract_skills_from_resume(self, text):
	    """List the entries of ``essential_skills`` that occur in ``text``.

	    Matching is case-insensitive and anchored on word boundaries.

	    Raises:
	        ValueError: If ``text`` is not a string.
	    """
	    if not isinstance(text, str):
	        raise ValueError(f"Expected 'text' to be a string, but got {type(text)}")

	    return [
	        skill
	        for skill in essential_skills
	        if re.search(r"\b{}\b".format(re.escape(skill)), text, re.IGNORECASE)
	    ]

	def extract_keyword_variations_from_resume(self, text):
	    """Record, per keyword family, the first variation present in ``text``.

	    Matching is a case-insensitive substring test; at most one variation
	    is reported for each entry of ``keyword_variations``.
	    """
	    hits = []
	    for variations in keyword_variations.values():
	        first_hit = next(
	            (variant for variant in variations if variant.lower() in text.lower()),
	            None,
	        )
	        if first_hit is not None:
	            hits.append(first_hit)
	    return hits

	def extract_keyword_variations_from_formatted_text(self, formatted_text):
	    """Record, per keyword family, the first variation in ``formatted_text``.

	    Returns:
	        list: at most one variation for each entry of
	        ``keyword_variations``, found by case-insensitive substring search.
	    """
	    # This used to be a line-for-line copy of
	    # extract_keyword_variations_from_resume; delegate so the scan lives
	    # in one place.
	    return self.extract_keyword_variations_from_resume(formatted_text)

	def extract_linkedIn_urls_from_pdf(self, pdf_path):
	    """Return the LinkedIn URL linked from a PDF, if any.

	    Every link annotation is tested against ``linkedin_domain``; when
	    several match, the last one encountered is returned.

	    Returns:
	        str or None: The matching URL, or ``None`` when no link matches.
	    """
	    linkedin_urls = None
	    # ``with`` closes the document even when a page raises; previously
	    # close() was skipped on any error.
	    with fitz.open(pdf_path) as pdf_document:
	        for page_num in range(len(pdf_document)):
	            page = pdf_document.load_page(page_num)
	            for link in page.get_links():
	                url = link.get('uri') or ''
	                if re.search(linkedin_domain, url):
	                    linkedin_urls = url
	    return linkedin_urls

	def extract_github_urls_from_pdf(self, pdf_path):
	    """Return the GitHub URL linked from a PDF, if any.

	    A link qualifies when it matches ``github_domain`` and, once that
	    match is removed, contains no further ``/``. When several qualify,
	    the last one encountered is returned.

	    Returns:
	        str or None: The matching URL, or ``None`` when no link qualifies.
	    """
	    github_urls = None
	    # ``with`` closes the document even when a page raises; previously
	    # close() was skipped on any error.
	    with fitz.open(pdf_path) as pdf_document:
	        for page_num in range(len(pdf_document)):
	            page = pdf_document.load_page(page_num)
	            for link in page.get_links():
	                url = link.get('uri') or ''
	                if not re.search(github_domain, url):
	                    continue
	                # Presumably this keeps profile links and drops repository
	                # links; that depends on what github_domain matches.
	                path = re.sub(github_domain, '', url)
	                if len(path.split('/')) == 1:
	                    github_urls = url
	    return github_urls


	def extract_extra_urls_pdf(self, pdf_path, domains):
	    """Group the PDF's hyperlinks by which of ``domains`` they match.

	    Args:
	        pdf_path: Path of the PDF to scan.
	        domains: Iterable of regex patterns, matched case-insensitively.

	    Returns:
	        dict: pattern -> list of distinct matching URLs. Patterns without
	        a hit are omitted. If the PDF cannot be processed the error is
	        logged and whatever was collected so far is returned.
	    """
	    extracted_urls = defaultdict(set)
	    try:
	        # Opening inside ``with`` fixes the old ``finally`` block, which
	        # referenced an unbound ``pdf_document`` whenever fitz.open()
	        # itself failed.
	        with fitz.open(pdf_path) as pdf_document:
	            for page_num in range(len(pdf_document)):
	                page = pdf_document.load_page(page_num)
	                for link in page.get_links():
	                    url = link.get('uri') or ''
	                    if not url:
	                        continue
	                    for domain in domains:
	                        if re.search(domain, url, re.IGNORECASE):
	                            extracted_urls[domain].add(url)
	    except Exception:
	        # Deliberately best-effort: a broken PDF must not abort the review.
	        logging.getLogger(__name__).exception("Error processing PDF: %s", pdf_path)

	    return {domain: list(urls) for domain, urls in extracted_urls.items()}

	def is_valid_url(self, github_urls):
	    """Probe a GitHub URL and return a suggestion when it is unreachable.

	    Despite the name, the return value is a message, not a boolean.

	    Args:
	        github_urls: URL string to check; ``None`` or an empty string
	            skips the check.

	    Returns:
	        str: ``""`` when the URL answers HTTP 200 (or no URL was given),
	        otherwise a hint asking the user to correct it.
	    """
	    if not github_urls:
	        return ""

	    invalid = "GitHub URL is not valid, please check and correct. "
	    try:
	        # HEAD does not follow redirects unless asked to, and requests
	        # waits indefinitely without an explicit timeout.
	        response = requests.head(github_urls, allow_redirects=True, timeout=10)
	    except requests.RequestException:
	        return invalid
	    return "" if response.status_code == 200 else invalid


	def is_valid_name(self, name):
	    """Heuristically decide whether ``name`` could be a person's name.

	    A candidate is rejected when it contains a digit, has more than three
	    whitespace-separated words, or equals a known non-name label.
	    """
	    non_names = {"Email", "Github", "LinkedIn", "Portfolio", "Data Analyst"}
	    has_digit = any(ch.isdigit() for ch in name)
	    too_many_words = len(name.split()) > 3
	    return not (has_digit or too_many_words or name in non_names)

	def extract_name(self, resume_text):

	lines = resume_text.split('\n')

	# Use regex to find lines that likely contain names
	name_lines = [line for line in lines if re.match(r'^[A-Za-z]\s[A-Za-z]', line.strip())]

	names = []
	for i in range(len(name_lines)):
	if self.is_valid_name(name_lines[i].strip()):
	names.append(name_lines[i].strip())

	if len(names) >= 1:
	name = names[0]
	suggestion = ""
	# Check if the name parts contain only alphabetic characters
	name_parts = name.split()
	if any(part[0].islower() for part in name_parts):
	suggestion += " name should start with a capital letter. "
	return name, suggestion

	return None, "No valid name found"


	def check_missing_sections(self, resume_data):
	    """Return the ``basic_informations`` entries that are absent or empty in ``resume_data``."""
	    return [field for field in basic_informations if not resume_data.get(field)]

	def segregate_sections(self, text):
	    """Split resume text into sections keyed by their upper-cased header.

	    A line is a header when, ignoring case and surrounding whitespace, it
	    consists solely of one of ``section_headers``, optionally followed by
	    a colon. Lines after a header belong to that section until the next
	    header; lines before the first header are dropped. A header that
	    appears again starts its section afresh.

	    Args:
	        text: Plain text of the resume.

	    Returns:
	        dict: header (upper case) -> list of stripped lines.
	    """
	    # Join with a plain "|" so the headers are alternatives; the previous
	    # separator was an escaped pipe, i.e. a literal "|" character.
	    alternatives = "|".join(re.escape(header) for header in section_headers)
	    if not alternatives:
	        return {}
	    # "\s*" rather than "\s": lines are stripped before matching, so a
	    # mandatory whitespace character on either side could never match.
	    header_pattern = re.compile(r'^\s*(' + alternatives + r'):?\s*$', re.IGNORECASE)

	    sections_text = {}
	    current_section = None
	    for line in text.splitlines():
	        clean_line = line.strip()
	        match = header_pattern.match(clean_line)
	        if match:
	            current_section = match.group(1).upper()
	            sections_text[current_section] = []
	        elif current_section:
	            sections_text[current_section].append(clean_line)

	    return sections_text

	def extract_and_format_sections(self, sections_text, Extract_sections):
	    """Render the requested sections as ``NAME:`` blocks of single-line text.

	    Sections listed in ``Extract_sections`` but missing from
	    ``sections_text`` are skipped. Each block ends with a blank line.
	    """
	    chunks = []
	    for wanted in Extract_sections:
	        if wanted not in sections_text:
	            continue
	        body = " ".join(sections_text[wanted]).replace('\n', ' ')
	        chunks.append(f"{wanted}:\n{body}\n\n")
	    return "".join(chunks)

	def replace_keywords_with_placeholders(self, formatted_text, found_keyword_section):
	    """Mask known keywords with ``{KEYWORD_n}`` tokens.

	    ``n`` is the keyword's position in ``found_keyword_section``; repeated
	    keywords are handled once, at their first position. Replacement is
	    case-insensitive and limited to whole words.

	    Returns:
	        tuple: ``(masked_text, keyword_placeholders)`` where the mapping
	        goes from token to original keyword.
	    """
	    keyword_placeholders = {}
	    masked_text = formatted_text
	    seen = set()

	    for position, term in enumerate(found_keyword_section):
	        if term in seen:
	            continue
	        seen.add(term)
	        token = f"{{KEYWORD_{position}}}"
	        keyword_placeholders[token] = term
	        whole_word = r'\b' + re.escape(term) + r'\b'
	        masked_text = re.sub(whole_word, token, masked_text, flags=re.IGNORECASE)

	    return masked_text, keyword_placeholders

	def replace_placeholders_with_keywords(self, grammar_issues, keyword_placeholders):
	    """Put the original keywords back into each issue's ``context``.

	    The issue dictionaries are modified in place; the same objects are
	    also returned in a new list.
	    """
	    def restore(snippet):
	        for token, term in keyword_placeholders.items():
	            snippet = snippet.replace(token, term)
	        return snippet

	    restored = []
	    for issue in grammar_issues:
	        issue['context'] = restore(issue['context'])
	        restored.append(issue)
	    return restored

	def grammar_check(self, placeholder_text):
	    """Run the module-level LanguageTool instance over ``placeholder_text``.

	    Returns:
	        list: one dict per reported match with the keys ``context``,
	        ``error``, ``rule_id`` and ``suggested_correction``.
	    """
	    return [
	        {
	            "context": found.context,
	            "error": found.message,
	            "rule_id": found.ruleId,
	            "suggested_correction": found.replacements,
	        }
	        for found in tool.check(placeholder_text)
	    ]

	def filter_grammar_issues(self, grammar_issues, ignore_rule_ids=None, ignore_error_keywords=None):
	    """Drop issues whose rule id or error text is on an ignore list.

	    Args:
	        grammar_issues: Issue dicts with ``rule_id`` and ``error`` keys.
	        ignore_rule_ids: Rule ids to discard (default: none).
	        ignore_error_keywords: Substrings of the error message that cause
	            an issue to be discarded (default: none).

	    Returns:
	        list: the issues that survive both filters, in input order.
	    """
	    skipped_rules = [] if ignore_rule_ids is None else ignore_rule_ids
	    skipped_words = [] if ignore_error_keywords is None else ignore_error_keywords

	    def keep(issue):
	        if issue['rule_id'] in skipped_rules:
	            return False
	        return not any(word in issue['error'] for word in skipped_words)

	    return [issue for issue in grammar_issues if keep(issue)]

	def process_resume(self, text, found_keyword_section, Extract_sections):
	    """Grammar-check the requested sections of a resume.

	    Known keywords are masked with placeholders before checking and put
	    back afterwards; issues on the module's ignore lists are then
	    removed. The ``found_keyword_section`` argument is recomputed from
	    the formatted text, so the value passed in is not used.

	    Returns:
	        dict: ``{"grammar_issues": [...], "spelling_errors": [...]}``
	        where the second list holds the issues whose rule id contains
	        "SPELLING".
	    """
	    sections = self.segregate_sections(text)
	    section_text = self.extract_and_format_sections(sections, Extract_sections)
	    keywords = self.extract_keyword_variations_from_formatted_text(section_text)
	    masked_text, placeholders = self.replace_keywords_with_placeholders(section_text, keywords)

	    issues = self.grammar_check(masked_text)
	    # Keywords are restored in place, so ``issues`` itself is updated.
	    self.replace_placeholders_with_keywords(issues, placeholders)
	    kept = self.filter_grammar_issues(issues, ignore_rule_ids, ignore_error_keywords)

	    spelling = [issue for issue in kept if "SPELLING" in issue['rule_id']]
	    return {"grammar_issues": kept, "spelling_errors": spelling}

	def grammar_issue_check(self, text, found_keyword_section, Extract_sections):
	    """Run ``process_resume`` once per section and collect the results.

	    Args:
	        text: Plain text of the resume.
	        found_keyword_section: Passed through to ``process_resume``.
	        Extract_sections: Names of the sections to check individually.

	    Returns:
	        dict: section name -> that section's ``process_resume`` result,
	        or the string "no error found" when the result is falsy.
	    """
	    # The old body also built an unused flattened copy of ``text``; it
	    # has been removed.
	    return {
	        section: self.process_resume(text, found_keyword_section, [section]) or "no error found"
	        for section in Extract_sections
	    }

	def normalize_font_name(self, font_name):
	    """Reduce a PDF font name to its family part.

	    Everything from the first ``-`` onwards is dropped; if a ``+`` is
	    still present, the segment following the first ``+`` is kept.
	    """
	    family = font_name.split('-', 1)[0]
	    if '+' in family:
	        family = family.split('+', 2)[1]
	    return family


	def extract_text_properties(self, pdf_path, predefined_terms):
	    """Collect runs of same-styled text from a PDF with their font details.

	    Characters are accumulated into a phrase until whitespace, one of the
	    special characters, or a change of font size, font name or page is
	    met. A finished phrase is recorded unless it is a substring of one of
	    ``predefined_terms``.

	    Args:
	        pdf_path: PDF file handed to pdfminer's ``extract_pages``.
	        predefined_terms: Strings whose fragments are to be ignored.
	            Non-string entries (e.g. ``None`` for a name or e-mail that
	            was not found) are skipped.

	    Returns:
	        list: dicts with the keys ``text``, ``font_size``, ``font_name``
	        and ``page_num``.
	    """
	    text_properties = []
	    current_phrase = ""
	    current_font_size = None
	    current_font_name = None
	    current_page_num = None

	    # ``phrase in None`` raises TypeError, so keep only real strings.
	    known_terms = [term for term in predefined_terms if isinstance(term, str)]

	    special_characters = set("●▪•!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~")

	    def add_current_phrase():
	        """Record the pending phrase (if any) and start a new one."""
	        nonlocal current_phrase
	        if current_phrase.strip():
	            if not any(current_phrase in term for term in known_terms):
	                text_properties.append({
	                    "text": current_phrase,
	                    "font_size": current_font_size,
	                    "font_name": current_font_name,
	                    "page_num": current_page_num
	                })
	        current_phrase = ""

	    for page_layout in extract_pages(pdf_path):
	        page_num = page_layout.pageid
	        for element in page_layout:
	            if not isinstance(element, LTTextContainer):
	                continue
	            for text_line in element:
	                if not isinstance(text_line, LTTextLineHorizontal):
	                    continue
	                for character in text_line:
	                    if not isinstance(character, LTChar):
	                        continue

	                    text = character.get_text()
	                    font_size = round(character.size, 2)
	                    font_name = self.normalize_font_name(character.fontname)

	                    # Separators end the current phrase and are not kept.
	                    if text.isspace() or text in special_characters:
	                        add_current_phrase()
	                        continue

	                    # A style or page change also ends the current phrase.
	                    if (font_size != current_font_size
	                            or font_name != current_font_name
	                            or page_num != current_page_num):
	                        add_current_phrase()
	                        current_font_size = font_size
	                        current_font_name = font_name
	                        current_page_num = page_num

	                    current_phrase += text

	    add_current_phrase()

	    return text_properties

	def group_similar_fonts(self, text_properties, tolerance=0.5):
	    """Bucket text runs by font name and by font size rounded to ``tolerance``.

	    Returns:
	        defaultdict: ``(font_name, rounded_size)`` -> list of the
	        property dicts that fall into that bucket.
	    """
	    buckets = defaultdict(list)
	    for entry in text_properties:
	        steps = round(entry["font_size"] / tolerance)
	        buckets[(entry["font_name"], steps * tolerance)].append(entry)
	    return buckets




	def identify_different_fonts_and_sizes(self, grouped_properties):
	    """List the text runs whose font differs from the most common one.

	    Args:
	        grouped_properties: Mapping ``(font_name, size)`` -> list of
	            property dicts, as built by ``group_similar_fonts``.

	    Returns:
	        list: one dict per deviating run with the keys ``page_num``,
	        ``text``, ``found_size``, ``found_font_name`` and ``reason``.
	        Empty when there is nothing to compare.
	    """
	    # max() raises ValueError on an empty mapping (a PDF without text).
	    if not grouped_properties:
	        return []

	    # The largest group defines the reference style; on a tie the group
	    # inserted first wins.
	    dominant_key, _ = max(grouped_properties.items(), key=lambda item: len(item[1]))
	    dominant_font, dominant_size = dominant_key

	    different_texts = []
	    for key, group in grouped_properties.items():
	        if key == dominant_key:
	            continue
	        font_name, size = key
	        reason = []
	        if size != dominant_size:
	            reason.append(f"size not {dominant_size}")
	        if font_name != dominant_font:
	            reason.append(f"font not {dominant_font}")
	        why = ", ".join(reason)
	        for prop in group:
	            different_texts.append({
	                "page_num": prop['page_num'],
	                "text": prop['text'],
	                "found_size": prop['font_size'],
	                "found_font_name": prop['font_name'],
	                "reason": why
	            })

	    return different_texts

	def parse_dates(self, sections_text, section_name):
	    """Extract date mentions from the lines of one resume section.

	    Only lines with at least two date matches contribute: exactly two
	    matches are joined into one "start end" string, more than two are
	    added individually. Matching is done on the lower-cased line.

	    Args:
	        sections_text: Mapping of section name -> list of lines.
	        section_name: Key of the section to scan; a missing key yields an
	            empty result.

	    Returns:
	        list: date strings in the order they were found.
	    """
	    month = (
	        r'(?:jan(?:uary)?|feb(?:ruary)?|mar(?:ch)?|apr(?:il)?|may|jun(?:e)?|'
	        r'jul(?:y)?|aug(?:ust)?|sep(?:tember)?|oct(?:ober)?|nov(?:ember)?|dec(?:ember)?)'
	    )
	    # The alternatives must be joined with a bare "|"; the previous
	    # pattern used an escaped pipe, which demands a literal "|" in the
	    # text and therefore never matched. Its final "range" alternative is
	    # omitted: its start is always claimed by the Month/YYYY alternative,
	    # so a range shows up as two matches.
	    date_pattern = '|'.join([
	        r'\b\d{1,2}/\d{4}\b',                       # MM/YYYY
	        r'\b' + month + r'\s+\d{4}\b',              # Month YYYY
	        r'\b' + month + r'\s+\d{1,2},?\s*\d{4}\b',  # Month DD, YYYY
	        r'\b\d{4}\b',                               # YYYY
	        r'\b' + month + r'[a-z]*/?\d{4}\b',         # Month/YYYY
	    ])
	    date_regex = re.compile(date_pattern)

	    all_dates = []
	    for entry in sections_text.get(section_name, []):
	        matches = date_regex.findall(entry.lower())
	        if len(matches) == 2:
	            all_dates.append(f"{matches[0]} {matches[1]}")
	        elif len(matches) > 2:
	            all_dates.extend(matches)

	    return all_dates


	def convert_to_date(self, date_str):
	    """Turn a date or date-range string into ``(start_date, end_date)``.

	    The string is first cut into date-like fragments. One fragment is
	    treated as a single date (start equals end), two as a range. Each
	    date is normalised to the first day of its month; a bare year maps
	    to January.

	    Args:
	        date_str: Text such as "jan 2020 mar 2021" or "june 2019".

	    Returns:
	        tuple: two ``datetime.date`` objects, or an empty list when the
	        string does not contain one or two parseable fragments.
	    """
	    month_map = {
	        'jan': 1, 'january': 1, 'feb': 2, 'february': 2,
	        'mar': 3, 'march': 3, 'apr': 4, 'april': 4,
	        'may': 5, 'jun': 6, 'june': 6, 'jul': 7,
	        'july': 7, 'aug': 8, 'august': 8, 'sep': 9,
	        'september': 9, 'oct': 10, 'october': 10,
	        'nov': 11, 'november': 11, 'dec': 12, 'december': 12,
	    }

	    pattern_mm_yyyy = re.compile(r'(\d{1,2})/(\d{4})')
	    pattern_mm_yyyy_space = re.compile(r'(\d{1,2})\s(\d{4})')
	    pattern_month_yyyy = re.compile(r'([a-zA-Z]+)\s?(\d{4})')
	    pattern_yyyy = re.compile(r'(\d{4})')

	    def extract_date(part):
	        """Parse one fragment; return ``None`` when it cannot be read."""
	        numeric = pattern_mm_yyyy.match(part) or pattern_mm_yyyy_space.match(part)
	        named = pattern_month_yyyy.match(part)
	        bare_year = pattern_yyyy.match(part)

	        if numeric:
	            month, year = int(numeric.group(1)), int(numeric.group(2))
	        elif named:
	            month = month_map.get(named.group(1).lower())
	            year = int(named.group(2))
	        elif bare_year:
	            month, year = 1, int(bare_year.group(1))
	        else:
	            return None

	        try:
	            return datetime.date(year, month, 1)
	        except (TypeError, ValueError):
	            # Unknown month word (None) or month number outside 1-12.
	            return None

	    # Alternatives are separated by a bare "|"; the previous pattern used
	    # escaped pipes, which only match a literal "|" and so found nothing.
	    date_parts = re.findall(
	        r'(\d{4}\s[a-zA-Z]+\s?|\d{4}[a-zA-Z]+|\d{4}/\d{2}|\d{4}\s\d{2}'
	        r'|[a-zA-Z]+\s?\d{4}|\d{4}\s[a-zA-Z]+)',
	        date_str,
	    )

	    if len(date_parts) == 1:
	        start_date = end_date = extract_date(date_parts[0])
	    elif len(date_parts) == 2:
	        start_date = extract_date(date_parts[0])
	        end_date = extract_date(date_parts[1])
	    else:
	        return []

	    # Report failure the same way as "no fragments" instead of raising.
	    if start_date is None or end_date is None:
	        return []
	    return start_date, end_date


	def date_time(self, date_parts):
	converted_dates = []
	for date_part in date_parts:
	start_date, end_date = self.convert_to_date(date_part)
	converted_dates.append((start_date, end_date))
	return converted_dates


	def check_chronological_order(self, converted_dates, section_name):
	    """State whether date ranges are listed newest first.

	    The ranges are considered ordered when the given list equals itself
	    sorted by (end, start) in descending order.

	    Returns:
	        str: a sentence naming ``section_name`` and the verdict.
	    """
	    newest_first = sorted(
	        converted_dates,
	        key=lambda span: (span[1], span[0]),
	        reverse=True,
	    )
	    if converted_dates == newest_first:
	        return f"{section_name} section is in chronological order."
	    return f"{section_name} section is not in chronological order."

	def check_common_projects(self, projects_text):
	    """Return the ``common_projects`` titles mentioned in ``projects_text``.

	    The test is a case-insensitive substring search.
	    """
	    return [
	        title
	        for title in common_projects
	        if title.lower() in projects_text.lower()
	    ]

	@staticmethod
	def recommend_resources():
	    """Pick up to two random blog articles and two random YouTube links.

	    Declared static because the function takes no ``self``; without the
	    decorator, calling it on an instance raised ``TypeError``.

	    Returns:
	        dict: ``{"Recommended Blogs": [...], "Recommended YouTube Links": [...]}``.
	    """
	    # random.sample raises ValueError when asked for more items than the
	    # population holds, so cap the sample size.
	    recommended_blogs = random.sample(blog_articles, min(2, len(blog_articles)))
	    recommended_youtube = random.sample(youtube_links, min(2, len(youtube_links)))

	    return {
	        "Recommended Blogs": recommended_blogs,
	        "Recommended YouTube Links": recommended_youtube
	    }

	def check_imarticus_certifications(self, certifications_text):
	    """Report whether the certifications text mentions "imarticus".

	    Returns:
	        dict: ``{"found": bool, "message": str}``; the search ignores case.
	    """
	    found = "imarticus" in certifications_text.lower()
	    if found:
	        message = "Imarticus certification found. Please upload it in the academic section."
	    else:
	        message = "No Imarticus certification found in the provided text."
	    return {"found": found, "message": message}


	def chronological_order_check(self, sections_text, section_name):
	order_suggestion = ""
	suggestion = ""
	section_name = section_name.upper()
	if section_name in sections_text:
	date = self.parse_dates(sections_text, section_name)
	if date:
	converted_dates = self.date_time(date)
	order_suggestion = self.check_chronological_order(converted_dates, section_name)
	else:
	suggestion = f"No valid dates found in {section_name} section. "
	else:
	suggestion = f"{section_name} is not in section header. "

	return order_suggestion, suggestion



	# Function to check for spelling mistakes
	def check_spelling(self, headers, section_headers):
	    """Return the headers that are not recognised section names.

	    Comparison is case-insensitive.

	    Args:
	        headers: Header strings found in the resume.
	        section_headers: The accepted section names.

	    Returns:
	        list: the unrecognised headers in input order; empty when every
	        header is known.
	    """
	    known = {name.upper() for name in section_headers}
	    # Each miss used to overwrite ``suggestions`` with a single string,
	    # so only the last unknown header survived and the return type
	    # flipped between list and str.
	    return [header for header in headers if header.upper() not in known]

	def is_present_name(self, name):
	    """
	    Checks if a given name has at least 2 words.

	    Args:
	        name: The name string to check.

	    Returns:
	        True if it has at least 2 words, False otherwise.
	    """
	    # NOTE(review): a second ``is_present_name`` further down in this
	    # class replaces this definition. ``self`` was missing here, so this
	    # version could not have been called as a method; it now matches the
	    # later one and can be removed once the duplicate is cleaned up.
	    return len(name.split()) >= 2

	def is_sentence_case(self, name):
	    """Tell whether every word of ``name`` is capitalised ("John Doe").

	    A word passes when its first character is upper case and the rest
	    contains no upper-case characters.
	    """
	    # NOTE(review): a second ``is_sentence_case`` further down in this
	    # class replaces this definition. ``self`` was missing here; it has
	    # been added so both definitions share one signature.
	    for part in name.split():
	        head, tail = part[0], part[1:]
	        # "".islower() is False, so a one-letter word such as the initial
	        # in "John D" used to fail; compare against lower() instead.
	        if not head.isupper() or tail != tail.lower():
	            return False
	    return True

	def is_present_name(self, name):
	    """Return ``True`` when ``name`` has at least two whitespace-separated words."""
	    word_count = len(name.split())
	    return word_count >= 2

	def is_sentence_case(self, name):
	    """Tell whether every word of ``name`` is capitalised ("John Doe").

	    A word passes when its first character is upper case and the rest
	    contains no upper-case characters.

	    Args:
	        name: The name string to check.

	    Returns:
	        bool: ``True`` when all words pass (also for an empty name).
	    """
	    for part in name.split():
	        head, tail = part[0], part[1:]
	        # "".islower() is False, so a one-letter word such as the initial
	        # in "John D" used to fail; compare against lower() instead.
	        if not head.isupper() or tail != tail.lower():
	            return False
	    return True

	def extract_project_links(self, sections_text):
	    """Map each PROJECTS line that contains URLs to the URLs found in it.

	    Returns:
	        dict: project line -> list of ``http``/``https`` URLs; lines
	        without a URL are left out. Empty when there is no PROJECTS
	        section.
	    """
	    url_regex = re.compile(r"https?://[^\s]+")
	    links_by_project = {}
	    for entry in sections_text.get("PROJECTS", []):
	        urls = url_regex.findall(entry)
	        if urls:
	            links_by_project[entry] = urls
	    return links_by_project

	def count_sentences(self, text):
	    """Count the sentences in ``text``.

	    The text is split at whitespace that follows ".", "?" or "!", except
	    after abbreviation-like patterns such as "e.g." or "Mr.". Blank
	    pieces are not counted.

	    Args:
	        text: The string to analyse.

	    Returns:
	        int: number of non-empty sentences.
	    """
	    # The look-behind must accept any ONE of . ? ! before the whitespace.
	    # The previous version escaped the alternation bars and therefore
	    # required the literal text ".|?|!", so nothing was ever split.
	    sentence_endings = r"(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=[.?!])\s"
	    sentences = [piece for piece in re.split(sentence_endings, text) if piece.strip()]
	    return len(sentences)

	def calculate_summary_score(self, summary):
	score = 0 # Initialize score
	if not summary:
	return score

	num_sentences = self.count_sentences(summary)
	if num_sentences <= 2:
	return 2
	elif num_sentences > 2 and num_sentences <= 4:
	return 3
	elif num_sentences > 4:
	return 5
	else:
	return 0

	def calculate_extra_urls_bonus(self, pdf_path):
	    """Return 5 when the PDF links to HackerRank, LeetCode or Medium, else 0."""
	    profile_patterns = [
	        r"hackerrank\.com",  # Hackerrank
	        r"leetcode\.com",  # LeetCode
	        r"medium\.com",  # Medium
	    ]
	    urls_by_domain = self.extract_extra_urls_pdf(pdf_path, profile_patterns)
	    for urls in urls_by_domain.values():
	        if urls:
	            return 5
	    return 0

	def calculate_relevant_experience_score(self, experience_text):
	    """Score the work-experience text for mentions of known skills.

	    Args:
	        experience_text: The experience section as a string or as a list
	            of lines.

	    Returns:
	        int: 5 when any entry of ``config.data_science_skills`` or
	        ``config.essential_skills`` occurs in the text (case-insensitive
	        substring search), otherwise 0. Empty input scores 0.
	    """
	    if not experience_text:
	        return 0

	    if isinstance(experience_text, list):
	        experience_text = " ".join(experience_text)
	    haystack = experience_text.strip().lower()

	    known_skills = config.data_science_skills + config.essential_skills
	    return 5 if any(skill.lower() in haystack for skill in known_skills) else 0

	def calculate_ds_skills_score(self, skills_present):
	    """Score the detected skills against ``config.data_science_skills``.

	    Returns:
	        int: 0 when no skills were given, 2 when none of them is a
	        data-science skill, 3 for one to five matches, 5 for more than
	        five. Matching ignores case.
	    """
	    if not skills_present:
	        return 0

	    ds_skills = {skill.lower() for skill in config.data_science_skills}
	    matching_count = 0
	    for skill in skills_present:
	        if skill.lower() in ds_skills:
	            matching_count += 1

	    if matching_count > 5:
	        return 5
	    if matching_count >= 1:
	        return 3
	    return 2

	def calculate_project_link_score(self, projects_with_links):
	    """Award points for having at least one project with a link.

	    Args:
	        projects_with_links (int): The number of projects with links.

	    Returns:
	        int: 2 when the count is positive, otherwise 0.
	    """
	    if projects_with_links > 0:
	        return 2
	    return 0


	def imarticus_review_score(self,name,contact_number,email,linkedin_urls,github_url,missing_sections,sections_not_capitalized,common_projects,section_order_suggestion,sections_text,skills,relevant_experience_score):
	score = 0
	if name:
	name_parts = name.split()
	num_parts = len(name_parts)

	if num_parts == 0:
	score += 0
	if self.is_sentence_case(name):
	score += 3
	elif self.is_present_name(name):
	score += 1.5

	if contact_number and isinstance(contact_number, str):
	digits_only = re.sub(r'\D', '', contact_number)

	if digits_only.startswith("91") and len(digits_only) > 10:
	digits_only = digits_only[2:] # Remove the first two characters ('91')

	if len(digits_only) == 10 and digits_only[0] in "6789": # Check for valid Indian mobile numbers
	score += 3

	if email:
	score += 3 if self.is_valid_email(email) else 0

	score += 3 if linkedin_urls else 0

	if github_url:
	github_suggestion = self.is_valid_url(github_url)
	score += 3 if not github_suggestion else 0
	else:
	score += 0

	if len(missing_sections)==0 and len(sections_not_capitalized)==0:
	score+=10
	elif len(missing_sections)==0 and len(sections_not_capitalized)>0:
	score+=8
	elif len(missing_sections)<=3:
	score+=6
	elif len(missing_sections)>4:
	score+=3

	if common_projects:
	score +=0
	else:
	score +=5

	if section_order_suggestion:
	score -= 2
	else:
	score

	"""
	ds_skills_list_lower = [skill.lower() for skill in data_science_skills]
	skills_present_lower = [skill.lower() for skill in self.extract_skills_from_resume(skills) ]

	matching_skill_count = 0
	for skill in skills_present_lower:
	if ds_skills_list_lower:
	matching_skill_count+=1
	if matching_skill_count==0:
	score+=0

	if matching_skill_count<=5:
	score+=2
	elif matching_skill_count>=10 and matching_skill_count<=15:
	score+5
	else:
	score+=8
	"""

	if "PROJECTS" not in sections_text:
	score+=0
	else:
	project_list = sections_text.get("PROJECTS",[])
	project_count = len([x for x in project_list if "Description" in x])

	if project_count<=2:
	score+=2
	elif project_count>2 and project_count<=4:
	score+=5
	elif project_count>4:
	score+=3


	resume_data = {}
	# Extract projects & links
	project_links = self.extract_project_links(sections_text)
	projects_with_links = len(project_links)

	# ✅ Count only projects with descriptions
	valid_projects = [
	p for p in sections_text.get("PROJECTS", []) if "description" in p.lower()
	]
	total_projects = len(valid_projects) # ✅ Count projects properly

	# ✅ Calculate project link score
	project_link_score = self.calculate_project_link_score(projects_with_links)
	resume_data["project_link_score"] = project_link_score

	# ✅ Prevent division by zero
	if total_projects > 0:
	if projects_with_links == 0:
	score += 0
	elif projects_with_links / total_projects >= 0.5:
	score += 1.5
	if projects_with_links == total_projects:
	score += 3
	else:
	score += 0 # ✅ Ensure no division error if no projects exist


	""""
	profile_summary = sections_text.get("PROFILE SUMMARY", "")
	print(profile_summary)

	summary_score = self.calculate_summary_score(profile_summary)
	score += summary_score
	"""
	ds_skills_score = self.calculate_ds_skills_score(skills)
	score += ds_skills_score

	certifications = sections_text.get("CERTIFICATIONS & ACADEMIC ENDEAVOURS", [])
	num_certifications = len(certifications)

	if num_certifications==0:
	score+=0
	elif 0 < num_certifications <= 2:
	score+=3
	elif 2 < num_certifications <= 4:
	score+=5
	elif num_certifications>4:
	score+=7
	"""
	extra_urls_bonus = self.calculate_extra_urls_bonus(pdf_path)
	score += extra_urls_bonus
	"""
	score += relevant_experience_score

	score += project_link_score

	return score


	def imarticus_detailed_score(self, name, contact_number, email, linkedin_urls, github_url,
	missing_sections=None, sections_not_capitalized=None, common_projects=None,
	section_order_suggestion=None, sections_text=None, skills=None,
	relevant_experience_score=0):

	# Ensure lists and dictionaries have default values to avoid 'NoneType' errors
	missing_sections = missing_sections or []
	sections_not_capitalized = sections_not_capitalized or []
	common_projects = common_projects or []
	sections_text = sections_text or {}

	score_breakdown = {
	"name_score": 0,
	"contact_number_score": 0,
	"email_score": 0,
	"linkedin_url_score": 0,
	"github_url_score": 0,
	"missing_sections_score": 0,
	"common_projects_score": 0,
	"section_order_score": 0,
	"projects_score": 0,
	"certifications_score": 0,
	"relevant_experience_score": 0,
	"ds_skills_score": 0,
	"extra_urls_bonus": 0,
	"summary_score": 0,
	"project_link_score": 0
	}

	# ✅ Name Score (3 Points)
	if name:
	if self.is_sentence_case(name):
	score_breakdown["name_score"] = 3
	elif self.is_present_name(name):
	score_breakdown["name_score"] = 1.5

	# ✅ Contact Number Score (3 Points)
	if contact_number and isinstance(contact_number, str):
	digits_only = re.sub(r'\D', '', contact_number)
	if digits_only.startswith("91") and len(digits_only) > 10:
	digits_only = digits_only[2:]
	if len(digits_only) == 10 and digits_only[0] in "6789":
	score_breakdown["contact_number_score"] = 3

	# ✅ Email Score (3 Points)
	score_breakdown["email_score"] = 3 if email and self.is_valid_email(email) else 0

	# ✅ LinkedIn URL Score (3 Points)
	score_breakdown["linkedin_url_score"] = 3 if linkedin_urls else 0

	# ✅ GitHub URL Score (3 Points)
	if github_url and self.is_valid_url(github_url):
	score_breakdown["github_url_score"] = 3

	# ✅ Missing Sections Score (10 Points)
	if not missing_sections and not sections_not_capitalized:
	score_breakdown["missing_sections_score"] = 10
	elif not missing_sections and sections_not_capitalized:
	score_breakdown["missing_sections_score"] = 8
	elif len(missing_sections) <= 3:
	score_breakdown["missing_sections_score"] = 6
	else:
	score_breakdown["missing_sections_score"] = 3

	# ✅ Common Projects Score (5 Points)
	score_breakdown["common_projects_score"] = 0 if common_projects else 5

	# ✅ Section Order Score (2 Points)
	score_breakdown["section_order_score"] = -2 if section_order_suggestion else 0

	# ✅ Projects Score (5 Points)
	if "PROJECTS" in sections_text:
	project_list = sections_text.get("PROJECTS", [])
	project_count = len([x for x in project_list if "Description" in x])
	if project_count <= 2:
	score_breakdown["projects_score"] = 2
	elif 2 < project_count <= 4:
	score_breakdown["projects_score"] = 5
	else:
	score_breakdown["projects_score"] = 3

	# ✅ Certifications Score (7 Points)
	certifications = sections_text.get("CERTIFICATIONS & ACADEMIC ENDEAVOURS", [])
	num_certifications = len(certifications)
	if num_certifications == 0:
	score_breakdown["certifications_score"] = 0
	elif 0 < num_certifications <= 2:
	score_breakdown["certifications_score"] = 3
	elif 2 < num_certifications <= 4:
	score_breakdown["certifications_score"] = 5
	else:
	score_breakdown["certifications_score"] = 7

	# ✅ Relevant Experience Score (5 Points)
	score_breakdown["relevant_experience_score"] = relevant_experience_score if relevant_experience_score is not None else 0

	# ✅ Data Science Skills Score (5 Points)
	score_breakdown["ds_skills_score"] = self.calculate_ds_skills_score(skills)

	# ✅ Extra URLs Bonus (5 Points)
	score_breakdown["extra_urls_bonus"] = self.calculate_extra_urls_bonus(sections_text)

	# ✅ Summary Score (5 Points)
	profile_summary = sections_text.get("PROFILE SUMMARY", "")
	score_breakdown["summary_score"] = self.calculate_summary_score(profile_summary)

	# ✅ Project Link Score (2 Points)
	project_links = self.extract_project_links(sections_text)
	projects_with_links = len(project_links)
	score_breakdown["project_link_score"] = self.calculate_project_link_score(projects_with_links)

	return score_breakdown

	def parse_text(self, path):
	    """Parse the resume PDF at ``path`` and build the full review payload.

	    The method extracts the raw text, splits it into sections, runs every
	    individual check (contact details, URLs, grammar, fonts, section order,
	    projects, certifications, keywords) and assembles the results into one
	    dictionary.

	    Args:
	        path: Location of the PDF file; it is handed unchanged to the
	            PDF-based helper methods.

	    Returns:
	        The value of ``jsonify(resume_data)``, i.e. the assembled review
	        dictionary wrapped by Flask's ``jsonify``.
	    """
	    logger = logging.getLogger(__name__)
	    # pdfminer is very chatty at DEBUG/INFO; keep only warnings and above.
	    logging.getLogger("pdfminer").setLevel(logging.WARNING)
	    logger.debug('parsing text')

	    # --- Raw text, skills, keywords and sections ---------------------------
	    text = self.extract_text_from_pdf(path)
	    skills_found = self.extract_skills_from_resume(text)
	    found_keywords = self.extract_keyword_variations_from_resume(text)
	    sections_text = self.segregate_sections(text)

	    # Snapshot the project / certification text straight away, before any
	    # other helper touches ``sections_text``; this replaces a second,
	    # identical ``segregate_sections(text)`` call.
	    projects_text = "\n".join(sections_text.get("PROJECTS", []))
	    certifications_text = "\n".join(
	        sections_text.get("CERTIFICATIONS & ACADEMIC ENDEAVOURS", [])
	    )

	    formatted_text = self.extract_and_format_sections(sections_text, Extract_sections)
	    found_keyword_section = self.extract_keyword_variations_from_formatted_text(formatted_text)

	    found_imarticus_certification = self.check_imarticus_certifications(certifications_text)
	    found_projects = self.check_common_projects(projects_text)

	    # --- Basic information and URLs ----------------------------------------
	    name, name_suggestion = self.extract_name(text)
	    contact_number, contact_suggestion = self.extract_contact_number_from_resume(text)
	    email, email_suggestion = self.extract_email_from_resume(path)
	    github_urls = self.extract_github_urls_from_pdf(path)
	    github_urls_suggestions = self.is_valid_url(github_urls)
	    linkedin_urls = self.extract_linkedIn_urls_from_pdf(path)
	    section_by_grammer_issues = self.grammar_issue_check(text, found_keyword_section, Extract_sections)

	    domains = [
	        r"hackerrank\.com",  # Hackerrank
	        r"leetcode\.com",  # LeetCode
	        r"medium\.com",  # Medium
	    ]
	    extra_urls = self.extract_extra_urls_pdf(path, domains)

	    # --- Ordering and spelling checks --------------------------------------
	    # Only the first element of each returned pair is used in the output.
	    education_order_suggestion, _ = self.chronological_order_check(sections_text, "ACADEMIC PROFILE")
	    experience_order_suggestion, _ = self.chronological_order_check(sections_text, "WORK EXPERIENCE")

	    spelling_suggestions = self.check_spelling(list(sections_text.keys()), section_headers)

	    # --- Font consistency --------------------------------------------------
	    predefined_terms = [name, email]
	    predefined_terms.extend(required_sections)
	    text_properties = self.extract_text_properties(path, predefined_terms)
	    grouped_properties = self.group_similar_fonts(text_properties)
	    different_texts = self.identify_different_fonts_and_sizes(grouped_properties)

	    font_suggestions = [
	        f"Formatting issue at Page: {item['page_num']}, Text: {item['text']}, Reason: {item['reason']}, Found font size: {item['found_size']}, Found font name: {item['found_font_name']}"
	        for item in different_texts
	    ]

	    missing_sections, sections_not_capitalized = self.extract_sections_from_resume(text)

	    # --- Fallback suggestions for missing basic information ----------------
	    if not name:
	        name_suggestion = "Please add name to the resume."
	    if not contact_number:
	        contact_suggestion = "Please add the contact number to the resume."
	    if not email:
	        email_suggestion = "Please add the email address to the resume."
	    if not github_urls:
	        github_urls_suggestions = "add the github_urls to the resume."

	    # --- Project count suggestion ------------------------------------------
	    # A project entry is counted when it mentions "description" (any case).
	    project_count = sum(
	        1 for entry in sections_text.get("PROJECTS", [])
	        if "description" in entry.lower()
	    )
	    if project_count == 0:
	        project_length_suggestion = "No projects found. Consider at least 2 projects."
	    elif project_count == 1:
	        project_length_suggestion = "Only 1 project found. Consider adding 1 more project."
	    else:
	        project_length_suggestion = f"{project_count} projects found."

	    # --- Relevant experience score -----------------------------------------
	    relevant_experience_score = self.calculate_relevant_experience_score(
	        sections_text.get("WORK EXPERIENCE", "")
	    )

	    # --- Reading / viewing recommendations ---------------------------------
	    # Cap the sample size so a short configured list cannot raise ValueError.
	    recommended_blogs = random.sample(blog_articles, min(2, len(blog_articles)))
	    recommended_youtube = random.sample(youtube_links, min(2, len(youtube_links)))

	    # --- Overall score -----------------------------------------------------
	    imarticus_score = self.imarticus_review_score(
	        name,
	        contact_number,
	        email,
	        linkedin_urls,
	        github_urls,
	        missing_sections,
	        sections_not_capitalized,
	        common_projects=found_projects,
	        section_order_suggestion=experience_order_suggestion,
	        skills=skills_found,
	        sections_text=sections_text,
	        relevant_experience_score=relevant_experience_score,
	    )

	    # --- Assemble the response payload -------------------------------------
	    resume_data = {
	        "name": name,
	        "contact_number": contact_number,
	        "email": email,
	        "linkedin_urls": linkedin_urls,
	        "experience_order_suggestion": experience_order_suggestion,
	        "education_order_suggestion": education_order_suggestion,
	        "grammer_issues_by_section": section_by_grammer_issues,
	        "github_urls": github_urls,
	        "skills": skills_found,
	        "spelling_suggestions": spelling_suggestions,
	        "found_keywords": found_keywords,
	        "text": text,
	        "font_suggestions": font_suggestions,
	        "name_suggestion": name_suggestion,
	        "contact_suggestion": contact_suggestion,
	        "email_suggestion": email_suggestion,
	        "imarticus_score": imarticus_score,
	        "github_urls_suggestions": github_urls_suggestions,
	        "linkedin_urls_suggestion": "Add the LinkedIn URLs to the resume." if not linkedin_urls else "",
	        "missing_sections": missing_sections,
	        "common_projects": (
	            "Common projects found in Projects section: " + ", ".join(found_projects)
	            if found_projects else ""
	        ),
	        "project_length_suggestion": project_length_suggestion,
	        "extra_urls": extra_urls,
	        "certifications": {
	            "found": found_imarticus_certification["found"],
	            "message": found_imarticus_certification["message"],
	            "text": certifications_text,  # Extracted certification text
	        },
	        "recommended_blogs": recommended_blogs,
	        "recommended_youtube_links": recommended_youtube,
	    }

	    # --- Additional checks -------------------------------------------------
	    # WORK EXPERIENCE is expected in the third position. The length guard
	    # avoids an IndexError when fewer than three sections were parsed.
	    ordered_sections = list(sections_text.keys())
	    if (
	        "WORK EXPERIENCE" in sections_text
	        and len(ordered_sections) > 2
	        and ordered_sections[2] != "WORK EXPERIENCE"
	    ):
	        resume_data["section_order_suggestion"] = (
	            f"WORK EXPERIENCE should come before {ordered_sections[2]}"
	        )

	    missing_important_sections = self.check_missing_sections(resume_data)
	    resume_data["basic_information_section"] = missing_important_sections or "Basic information is Found"

	    # Added after the basic-information check so that check receives the
	    # same keys it always did. Previously this value was written to a dict
	    # that was then replaced, so it never reached the output.
	    resume_data["relevant_experience_score"] = relevant_experience_score

	    resume_data["missing_skills"] = list(set(essential_skills) - set(skills_found))

	    # Keyword-coverage rating: the first threshold the count falls below
	    # wins; a count at or above every threshold is rated "High" instead of
	    # being left without a rating.
	    found_keywords_count = len(found_keywords)
	    num_keywords = len(keyword_variations)
	    quality_thresholds = {"Low": 0.2, "Medium": 0.5, "High": 0.8}
	    resume_data["quality"] = next(
	        (
	            label
	            for label, fraction in quality_thresholds.items()
	            if found_keywords_count < num_keywords * fraction
	        ),
	        "High",
	    )

	    # The helper returns a dict, which is always truthy; the actual result
	    # is under its "found" key.
	    if found_imarticus_certification["found"]:
	        resume_data["found_certification"] = "Imarticus certification found in Certifications section."
	    else:
	        resume_data["found_certification"] = "No Imarticus certification found in Certifications section."

	    # --- Experience relevance check ----------------------------------------
	    experience_text = self.extract_and_format_sections(sections_text, ['WORK EXPERIENCE'])
	    if experience_text:
	        experience_lower = experience_text.lower()
	        mentions_keyword = any(
	            variation.lower() in experience_lower
	            for variations in keyword_variations.values()
	            for variation in variations
	        )
	        resume_data["work_experience_check"] = (
	            "Experience is relevant to Data science."
	            if mentions_keyword
	            else "Experience is not relevant to Data science."
	        )

	    return jsonify(resume_data)