Spaces:
Sleeping
Sleeping
Update utils/mistral.py
Browse files- utils/mistral.py +25 -2
utils/mistral.py
CHANGED
|
@@ -42,7 +42,7 @@ def Data_Cleaner(text):
|
|
| 42 |
def Model_ProfessionalDetails_Output(resume, client):
|
| 43 |
system_role = {
|
| 44 |
"role": "system",
|
| 45 |
-
"content": "You are a skilled resume parser. Your task is to extract
|
| 46 |
}
|
| 47 |
user_prompt = {
|
| 48 |
"role": "user",
|
|
@@ -292,7 +292,25 @@ def validate_contact_email(personal_data):
|
|
| 292 |
|
| 293 |
return valid_contact, invalid_contact, valid_email, invalid_email
|
| 294 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 295 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 296 |
def process_resume_data(file_path):
|
| 297 |
resume_text, hyperlinks = extract_text_based_on_format(file_path)
|
| 298 |
print("Resume converted to text successfully.")
|
|
@@ -312,6 +330,11 @@ def process_resume_data(file_path):
|
|
| 312 |
# Extract professional details using Mistral
|
| 313 |
pro_data = Model_ProfessionalDetails_Output(resume_text, client)
|
| 314 |
print(pro_data)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 315 |
# Check if per_data and pro_data have been populated correctly
|
| 316 |
if not per_data:
|
| 317 |
logging.warning("Mistral personal data extraction failed.")
|
|
@@ -360,7 +383,7 @@ def process_resume_data(file_path):
|
|
| 360 |
|
| 361 |
#Appending the list if any available as a text
|
| 362 |
result['personal']['other_links'] += per_data.get('personal', {}).get('link', 'Not found')
|
| 363 |
-
|
| 364 |
#Added the validator for details, Validate contact and email
|
| 365 |
valid_contact, invalid_contact, valid_email, invalid_email = validate_contact_email(result['personal'])
|
| 366 |
result['personal']['valid_contact'] = valid_contact
|
|
|
|
| 42 |
def Model_ProfessionalDetails_Output(resume, client):
|
| 43 |
system_role = {
|
| 44 |
"role": "system",
|
| 45 |
+
"content": "You are a skilled resume parser. Your task is to extract Professional details as well as Academic details from resumes in a structured JSON format defined by the User. Ensure accuracy and completeness while maintaining the format provided and if field are missing just return []."
|
| 46 |
}
|
| 47 |
user_prompt = {
|
| 48 |
"role": "user",
|
|
|
|
| 292 |
|
| 293 |
return valid_contact, invalid_contact, valid_email, invalid_email
|
| 294 |
|
| 295 |
+
# Extract links using regex as a fallback when the model doesn't extract contact details
|
| 296 |
+
def extract_link_details(text):
    """Extract website/profile links from text using regular expressions.

    Scans ``text`` for domain-like tokens (optionally prefixed with
    ``http://``/``https://`` and/or ``www.``) ending in one of a fixed set
    of TLDs. Only the prefix and domain are captured; any URL path after
    the TLD is not part of the returned string.

    Args:
        text: Text to scan. ``None`` or an empty string yields ``[]``.

    Returns:
        A list of matched link strings, in order of appearance, that are at
        least 11 characters long and do not lie inside an email address.
    """
    if not text:
        return []

    # Email regex
    email_regex = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b')

    # URL and links regex, restricted to a known list of TLDs
    link_regex = re.compile(r'\b(?:https?:\/\/)?(?:www\.)?[a-zA-Z0-9-]+\.(?:com|co\.in|co|io|org|net|edu|gov|mil|int|uk|us|in|de|au|app|tech|xyz|info|biz|fr|dev)\b')

    # Character ranges occupied by email addresses. A link match that
    # overlaps one of these is just the email's domain (e.g. "company.com"
    # out of "john@company.com"), not a real link. The previous substring
    # test (`email in link`) could never be true because a link match
    # cannot contain "@", so such domains leaked into the result.
    email_spans = [match.span() for match in email_regex.finditer(text)]

    # Minimum accepted link length; presumably intended to skip very short
    # domain-like tokens -- value kept from the original implementation.
    min_link_length = 11

    links = []
    for match in link_regex.finditer(text):
        link = match.group()
        if len(link) < min_link_length:
            continue
        start, end = match.span()
        overlaps_email = any(
            start < email_end and end > email_start
            for email_start, email_end in email_spans
        )
        if overlaps_email:
            continue
        links.append(link)

    return links
|
| 313 |
+
|
| 314 |
def process_resume_data(file_path):
|
| 315 |
resume_text, hyperlinks = extract_text_based_on_format(file_path)
|
| 316 |
print("Resume converted to text successfully.")
|
|
|
|
| 330 |
# Extract professional details using Mistral
|
| 331 |
pro_data = Model_ProfessionalDetails_Output(resume_text, client)
|
| 332 |
print(pro_data)
|
| 333 |
+
|
| 334 |
+
# Extract link using Regular Expression
|
| 335 |
+
links = extract_link_details(resume_text)
|
| 336 |
+
print(links)
|
| 337 |
+
|
| 338 |
# Check if per_data and pro_data have been populated correctly
|
| 339 |
if not per_data:
|
| 340 |
logging.warning("Mistral personal data extraction failed.")
|
|
|
|
| 383 |
|
| 384 |
#Appending the list if any available as a text
|
| 385 |
result['personal']['other_links'] += per_data.get('personal', {}).get('link', 'Not found')
|
| 386 |
+
result['personal']['other_links'] += links
|
| 387 |
#Added the validator for details, Validate contact and email
|
| 388 |
valid_contact, invalid_contact, valid_email, invalid_email = validate_contact_email(result['personal'])
|
| 389 |
result['personal']['valid_contact'] = valid_contact
|