ResumeExtractor3

Sleeping

App Files Files Community

WebashalarForML committed on Oct 17, 2024

Commit

d581df7

verified ·

1 Parent(s): f08d659

Update utils/mistral.py

Browse files

Files changed (1) hide show

utils/mistral.py +25 -2

utils/mistral.py CHANGED Viewed

@@ -42,7 +42,7 @@ def Data_Cleaner(text):
 def Model_ProfessionalDetails_Output(resume, client):
     system_role = {
     "role": "system",
-    "content": "You are a skilled resume parser. Your task is to extract professional details from resumes in a structured JSON format defined by the User. Ensure accuracy and completeness while maintaining the format provided and if field are missing just return []."
     }
     user_prompt = {
     "role": "user",
@@ -292,7 +292,25 @@ def validate_contact_email(personal_data):
     return valid_contact, invalid_contact, valid_email, invalid_email
 def process_resume_data(file_path):
     resume_text, hyperlinks = extract_text_based_on_format(file_path)
     print("Resume converted to text successfully.")
@@ -312,6 +330,11 @@ def process_resume_data(file_path):
         # Extract professional details using Mistral
         pro_data = Model_ProfessionalDetails_Output(resume_text, client)
         print(pro_data)
         # Check if per_data and pro_data have been populated correctly
         if not per_data:
             logging.warning("Mistral personal data extraction failed.")
@@ -360,7 +383,7 @@ def process_resume_data(file_path):
         #Appending the list if any available as a text
         result['personal']['other_links'] += per_data.get('personal', {}).get('link', 'Not found')
         #Added the validator for details, Validate contact and email
         valid_contact, invalid_contact, valid_email, invalid_email = validate_contact_email(result['personal'])
         result['personal']['valid_contact'] = valid_contact

 def Model_ProfessionalDetails_Output(resume, client):
     system_role = {
     "role": "system",
+    "content": "You are a skilled resume parser. Your task is to extract Professional details as well as Academic details from resumes in a structured JSON format defined by the User. Ensure accuracy and completeness while maintaining the format provided and if field are missing just return []."
     }
     user_prompt = {
     "role": "user",
     return valid_contact, invalid_contact, valid_email, invalid_email
# Regex-based fallback for extracting links when the model does not return
# contact details.
def extract_link_details(text):
    """Return link-like strings found in *text*, skipping email domains.

    A link is an optional ``http(s)://`` scheme and optional ``www.`` prefix
    followed by a single host label and one of a fixed set of TLDs. Only that
    part is matched; any path after the TLD is not captured. Matches shorter
    than 11 characters are discarded.

    Args:
        text: Text to scan. ``None`` or an empty string yields ``[]``.

    Returns:
        A list of matched link strings in order of appearance. Matches that
        fall inside an email address (e.g. the ``company.com`` of
        ``john@company.com``) are excluded.
    """
    if not text:
        return []

    # Email regex
    email_regex = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b')
    # URL and links regex
    link_regex = re.compile(r'\b(?:https?:\/\/)?(?:www\.)?[a-zA-Z0-9-]+\.(?:com|co\.in|co|io|org|net|edu|gov|mil|int|uk|us|in|de|au|app|tech|xyz|info|biz|fr|dev)\b')

    # Character ranges occupied by email addresses; a link match that begins
    # inside one of them is part of an email, not a standalone link.
    email_spans = [match.span() for match in email_regex.finditer(text)]

    links_RE = []
    for match in link_regex.finditer(text):
        link = match.group()
        # Very short matches are discarded by the length threshold.
        if len(link) < 11:
            continue
        # The previous check (`email in link`) could never be true because a
        # link match never contains '@', so email domains leaked through.
        if any(start <= match.start() < end for start, end in email_spans):
            continue
        links_RE.append(link)
    return links_RE
 def process_resume_data(file_path):
     resume_text, hyperlinks = extract_text_based_on_format(file_path)
     print("Resume converted to text successfully.")
         # Extract professional details using Mistral
         pro_data = Model_ProfessionalDetails_Output(resume_text, client)
         print(pro_data)
+        # Extract link using Regular Expression
+        links = extract_link_details(resume_text)
+        print(links)
         # Check if per_data and pro_data have been populated correctly
         if not per_data:
             logging.warning("Mistral personal data extraction failed.")
         #Appending the list if any available as a text
         result['personal']['other_links'] += per_data.get('personal', {}).get('link', 'Not found')
+        result['personal']['other_links'] += links
         #Added the validator for details, Validate contact and email
         valid_contact, invalid_contact, valid_email, invalid_email = validate_contact_email(result['personal'])
         result['personal']['valid_contact'] = valid_contact