ResumeExtractor2

Sleeping

App Files Files Community

WebashalarForML committed 14 days ago

Commit

162497f

verified ·

1 Parent(s): 0844bb1

Update utils/mistral.py

Browse files

Files changed (1) hide show

utils/mistral.py +108 -124

utils/mistral.py CHANGED Viewed

@@ -2,8 +2,7 @@
 import os
 import json
 import logging
-from huggingface_hub import InferenceClient
-from huggingface_hub.utils._errors import BadRequestError
 from dotenv import load_dotenv
 from utils.fileTotext import extract_text_based_on_format
 import re
@@ -12,144 +11,153 @@ from utils.spacy import Parser_from_model
 # Load environment variables from .env file
 load_dotenv()
-# Authenticate with Hugging Face
-HFT = os.getenv('HF_TOKEN')
-if not HFT:
-    raise ValueError("Hugging Face token is not set in environment variables.")
-client = InferenceClient(model="mistralai/Mistral-Nemo-Instruct-2407", token=HFT)
 # Function to clean model output
 def Data_Cleaner(text):
     pattern = r".*?format:"
     result = re.split(pattern, text, maxsplit=1)
     if len(result) > 1:
-        # Handle edge cases where JSON might not be properly formatted after 'format:'
         text_after_format = result[1].strip().strip('`').strip('json')
     else:
         text_after_format = text.strip().strip('`').strip('json')
-    # Try to ensure valid JSON is returned
     try:
-        json.loads(text_after_format)  # Check if it's valid JSON
         return text_after_format
     except json.JSONDecodeError:
         logging.error("Data cleaning led to invalid JSON")
-        return text  # Return the original text if cleaning goes wrong
-# Function to call Mistral and process output
-def Model_ProfessionalDetails_Output(resume, client):
     system_role = {
-    "role": "system",
-    "content": "You are a skilled resume parser. Your task is to extract professional details from resumes in a structured JSON format defined by the User. Ensure accuracy and completeness while maintaining the format provided and if field are missing just return 'not found'."
     }
     user_prompt = {
-    "role": "user",
-    "content": f'''Act as a resume parser for the following text given in text: {resume}
-    Extract the text in the following output JSON string as:
-    {{
-        "professional": {{
-            "technical_skills": "List all technical skills, programming languages, frameworks, and technologies mentioned in the resume, ensuring they are not mixed with other skill types.",
-            "non_technical_skills": "Identify and list non-technical skills such as leadership, teamwork, and communication skills, ensuring they are not mixed with technical skills.",
-            "tools": "Enumerate all software tools, platforms, and applications (e.g., Figma, Unity, MS Office, etc.) referenced in the resume, distinctly separate from skills.",
-            "projects": "Extract the names or titles of all projects mentioned in the resume.",
-            "projects_experience": "Summarize overall project experiences, providing a brief description of each project as detailed in the resume.",
-            "experience": "Calculate total professional work experience in years and months based on the resume.",
-            "companies_worked_at": "List the names of all companies where employment is mentioned in the resume.",
-            "certifications": "Extract and list all certifications obtained as stated in the resume.",
-            "roles": "Include the names of all job titles or roles held as indicated in the resume.",
-            "qualifications": "List educational qualifications (e.g., B.Tech) from the resume. If none are found, return 'No education listed'.",
-            "courses": "Extract the names of completed courses based on the resume. If none are found, return 'No courses listed'.",
-            "university": "Identify the name of the university, college, or institute attended, based on the resume. If not found, return 'No university listed'.",
-            "year_of_graduation": "Extract the year of graduation from the resume. If not found, return 'No year of graduation listed'."
         }}
-    }}
-    Json Output:
-    '''
     }
-    response = ""
-    for message in client.chat_completion(messages=[system_role, user_prompt], max_tokens=4096, stream=True, temperature=0.35):
-        response += message.choices[0].delta.content
     try:
         clean_response = Data_Cleaner(response)
         parsed_response = json.loads(clean_response)
-    except json.JSONDecodeError as e:
-        logging.error(f"JSON Decode Error: {e}")
         return {}
     return parsed_response
-def Model_PersonalDetails_Output(resume, client):
     system_role = {
-    "role": "system",
-    "content": "You are a skilled resume parser. Your task is to extract professional details from resumes in a structured JSON format defined by the User. Ensure accuracy and completeness while maintaining the format provided and if field are missing just return 'not found'."
     }
     user_prompt = {
-    "role": "user",
-    "content": f'''Act as a resume parser for the following text given in text: {resume}
-    Extract the text in the following output JSON string as:
-    {{
-        "personal": {{
-            "name": "Extract the full name based on the resume. If not found, return 'No name listed'.",
-            "contact_number": "Extract the contact number from the resume. If not found, return 'No contact number listed'.",
-            "email": "Extract the email address from the resume. If not found, return 'No email listed'.",
-            "Address": "Extract the Address or address from the resume. If not found, return 'No Address listed'.",
-            "link": "Extract any relevant links (e.g., portfolio, LinkedIn) from the resume. If not found, return 'No link listed'."
-        }}
-    }}
-    output:
-    '''
     }
-    # Response
-    response = ""
-    for message in client.chat_completion(
-        messages=[system_role, user_prompt],
-        max_tokens=3000,
-        stream=True,
-        temperature=0.35,
-    ):
-        response += message.choices[0].delta.content
-    # Handle cases where the response might have formatting issues
     try:
-        #print('The Og response:-->',response)
-        clean_response=Data_Cleaner(response)
-        #print("After data cleaning",clean_response)
         parsed_response = json.loads(clean_response)
-    except json.JSONDecodeError as e:
         print("JSON Decode Error:", e)
-        print("Raw Response:", response)
         return {}
     return parsed_response
-# # Fallback to SpaCy if Mistral fails
-# Add regex pattern for LinkedIn and GitHub links
 linkedin_pattern = r"https?://(?:www\.)?linkedin\.com/[\w\-_/]+"
 github_pattern = r"https?://(?:www\.)?github\.com/[\w\-_/]+"
 email_pattern = r"^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$"
 contact_pattern = r"^\+?[\d\s\-()]{7,15}$"
 def extract_links(hyperlinks):
     linkedin_links = []
     github_links = []
-    # Iterate through the hyperlinks and apply regex to find LinkedIn and GitHub links
     for link in hyperlinks:
         if re.match(linkedin_pattern, link):
             linkedin_links.append(link)
         elif re.match(github_pattern, link):
             github_links.append(link)
     return linkedin_links, github_links
 def is_valid_email(email):
     email_regex = r'^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$'
     return re.match(email_regex, email) is not None
@@ -284,45 +292,34 @@ def is_valid_contact(contact):
 def validate_contact_email(personal_data):
     contact = personal_data.get('contact', 'Not found')
     email = personal_data.get('email', 'Not found')
     valid_contact = is_valid_contact(contact) if contact != 'Not found' else False
     valid_email = is_valid_email(email) if email != 'Not found' else False
     invalid_contact = 'Invalid contact' if not valid_contact else 'Valid contact'
     invalid_email = 'Invalid email' if not valid_email else 'Valid email'
     return valid_contact, invalid_contact, valid_email, invalid_email
 def process_resume_data(file_path):
     resume_text, hyperlinks = extract_text_based_on_format(file_path)
     print("Resume converted to text successfully.")
     if not resume_text:
         return {"error": "Text extraction failed"}
-    # Extract LinkedIn and GitHub links
     linkedin_links, github_links = extract_links(hyperlinks)
-    # Attempt to use Mistral model for parsing
     try:
-        # Extract personal details using Mistral
-        per_data = Model_PersonalDetails_Output(resume_text, client)
-        print(per_data)
-        # Extract professional details using Mistral
-        pro_data = Model_ProfessionalDetails_Output(resume_text, client)
-        print(pro_data)
-        # Check if per_data and pro_data have been populated correctly
         if not per_data:
-            logging.warning("Mistral personal data extraction failed.")
             per_data = {}
         if not pro_data:
-            logging.warning("Mistral professional data extraction failed.")
             pro_data = {}
-        # Combine both personal and professional details into a structured output
         result = {
             "personal": {
                 "name": per_data.get('personal', {}).get('name', 'Not found'),
@@ -331,7 +328,7 @@ def process_resume_data(file_path):
                 "location": per_data.get('personal', {}).get('Address', 'Not found'),
                 "linkedin": linkedin_links,
                 "github": github_links,
-                "other_links": hyperlinks  # Store remaining links if needed
             },
             "professional": {
                 "technical_skills": pro_data.get('professional', {}).get('technical_skills', 'Not found'),
@@ -356,34 +353,21 @@ def process_resume_data(file_path):
                 ]
             }
         }
-        # Validate contact and email
         valid_contact, invalid_contact, valid_email, invalid_email = validate_contact_email(result['personal'])
         result['personal']['valid_contact'] = valid_contact
         result['personal']['invalid_contact'] = invalid_contact
         result['personal']['valid_email'] = valid_email
         result['personal']['invalid_email'] = invalid_email
-        # If Mistral produces valid output, return it
         if per_data or pro_data:
-            logging.info("Successfully extracted data using Mistral.")
-            print(result)
-            print("---------Mistral-------")
             return result
         else:
-            raise ValueError("Mistral returned no output")
-    # Handle HuggingFace API or Mistral model errors
-    except BadRequestError as e:
-        logging.error(f"HuggingFace API error: {e}. Falling back to SpaCy.")
-        print(f"HuggingFace API error: {e}. Falling back to SpaCy.")
     except Exception as e:
-        logging.error(f"An error occurred while processing with Mistral: {e}. Falling back to SpaCy.")
-        print(f"An error occurred while processing with Mistral: {e}. Falling back to SpaCy.")
-    # Fallback to SpaCy if Mistral fails
-    logging.warning("Mistral failed, switching to SpaCy.")
     print("---------SpaCy-------")
     return Parser_from_model(file_path)

 import os
 import json
 import logging
+import requests
 from dotenv import load_dotenv
 from utils.fileTotext import extract_text_based_on_format
 import re
 # Load environment variables from .env file
 load_dotenv()
+# Authenticate with Groq
+GROQ_API_KEY = os.getenv('GROQ_API_KEY')
+if not GROQ_API_KEY:
+    raise ValueError("Groq API key is not set in environment variables.")
+GROQ_API_URL = "https://api.groq.com/openai/v1/chat/completions"
+MODEL_NAME = "llama3-70b-8192"   # you can switch to mixtral if needed
+# 🔥 Groq LLM Call (Replacement for HuggingFace)
def call_llm(messages, max_tokens=2048, temperature=0.3, timeout=60):
    """Send a chat-completion request to the Groq API and return the reply text.

    Args:
        messages: List of chat message dicts (``{"role": ..., "content": ...}``)
            sent unchanged as the request's ``messages`` field.
        max_tokens: Sent as the request's ``max_tokens`` field.
        temperature: Sent as the request's ``temperature`` field.
        timeout: Seconds to wait for the HTTP request. Without a timeout,
            ``requests`` blocks indefinitely on a stalled connection.

    Returns:
        The ``content`` string of the first choice in the JSON response.

    Raises:
        RuntimeError: If the API answers with a non-200 status, or the body
            is not JSON shaped like ``choices[0].message.content``.
        requests.RequestException: On connection failures and timeouts.
    """
    headers = {
        "Authorization": f"Bearer {GROQ_API_KEY}",
        "Content-Type": "application/json"
    }
    payload = {
        "model": MODEL_NAME,
        "messages": messages,
        "temperature": temperature,
        "max_tokens": max_tokens
    }
    response = requests.post(
        GROQ_API_URL, headers=headers, json=payload, timeout=timeout
    )
    if response.status_code != 200:
        raise RuntimeError(f"Groq API Error: {response.status_code} | {response.text}")
    try:
        return response.json()["choices"][0]["message"]["content"]
    except (ValueError, KeyError, IndexError, TypeError) as exc:
        # Report a malformed body the same way as an HTTP error instead of
        # leaking a bare KeyError/IndexError to the caller.
        raise RuntimeError(
            f"Groq API returned an unexpected response body: {response.text[:500]}"
        ) from exc
 # Function to clean model output
 def Data_Cleaner(text):
     """Extract a JSON document from raw LLM output.

     The reply may echo a ``format:`` lead-in, wrap the JSON in a Markdown
     code fence tagged ``json``, or surround it with explanatory prose.

     Args:
         text: Raw model output.

     Returns:
         The cleaned string if it parses as JSON; otherwise the original
         ``text`` unchanged (an error is logged in that case).
     """
     # Keep only what follows the first "format:" marker, when present.
     parts = re.split(r".*?format:", text, maxsplit=1)
     candidate = (parts[1] if len(parts) > 1 else text).strip()

     # Drop a surrounding Markdown fence and its "json" language tag. The tag
     # is removed as a prefix: str.strip('json') treats its argument as a set
     # of characters and would eat any leading/trailing j, s, o or n.
     candidate = candidate.strip('`').strip()
     if candidate[:4].lower() == 'json':
         candidate = candidate[4:].strip()

     attempts = [candidate]
     # Fallback for replies that wrap the object in prose: take the span from
     # the first "{" to the last "}".
     first, last = candidate.find('{'), candidate.rfind('}')
     if first != -1 and last > first:
         attempts.append(candidate[first:last + 1])

     for attempt in attempts:
         try:
             json.loads(attempt)
         except json.JSONDecodeError:
             continue
         return attempt

     logging.error("Data cleaning led to invalid JSON")
     return text
+# Function to call LLM and process output
def Model_ProfessionalDetails_Output(resume, client=None):
    """Ask the LLM for the resume's professional details and parse the reply.

    Args:
        resume: Resume text, interpolated into the user prompt.
        client: Unused by this function; accepted so callers that still pass
            a client argument keep working.

    Returns:
        The parsed JSON object as a dict, or ``{}`` when the request fails,
        the reply is not valid JSON, or the reply is JSON but not an object.
    """
    system_role = {
        "role": "system",
        "content": "You are a skilled resume parser. Your task is to extract professional details from resumes in a structured JSON format defined by the User. Ensure accuracy and completeness while maintaining the format provided and if field are missing just return 'not found'."
    }
    user_prompt = {
        "role": "user",
        "content": f'''Act as a resume parser for the following text given in text: {resume}
        Extract the text in the following output JSON string as:
        {{
            "professional": {{
                "technical_skills": "...",
                "non_technical_skills": "...",
                "tools": "...",
                "projects": "...",
                "projects_experience": "...",
                "experience": "...",
                "companies_worked_at": "...",
                "certifications": "...",
                "roles": "...",
                "qualifications": "...",
                "courses": "...",
                "university": "...",
                "year_of_graduation": "..."
            }}
        }}
        Json Output:
        '''
    }
    try:
        response = call_llm([system_role, user_prompt], max_tokens=3000, temperature=0.35)
        clean_response = Data_Cleaner(response)
        parsed_response = json.loads(clean_response)
    except Exception as e:
        # Deliberately broad: any transport or parsing failure yields an
        # empty result so the caller can decide how to fall back.
        logging.error(f"LLM Error: {e}")
        return {}
    if not isinstance(parsed_response, dict):
        # Valid JSON that is not an object (list, string, ...) would break
        # callers that use .get() on the result.
        logging.error(
            "LLM Error: expected a JSON object, got %s",
            type(parsed_response).__name__,
        )
        return {}
    return parsed_response
def Model_PersonalDetails_Output(resume, client=None):
    """Ask the LLM for the resume's personal details and parse the reply.

    Args:
        resume: Resume text, interpolated into the user prompt.
        client: Unused by this function; accepted so callers that still pass
            a client argument keep working.

    Returns:
        The parsed JSON object as a dict, or ``{}`` when the request fails,
        the reply is not valid JSON, or the reply is JSON but not an object.
    """
    system_role = {
        "role": "system",
        # The original prompt asked for "professional details" here, a
        # copy-paste slip; this function requests the "personal" section.
        "content": "You are a skilled resume parser. Your task is to extract personal details from resumes in a structured JSON format defined by the User. Ensure accuracy and completeness while maintaining the format provided and if field are missing just return 'not found'."
    }
    user_prompt = {
        "role": "user",
        "content": f'''Act as a resume parser for the following text given in text: {resume}
        Extract the text in the following output JSON string as:
        {{
            "personal": {{
                "name": "...",
                "contact_number": "...",
                "email": "...",
                "Address": "...",
                "link": "..."
            }}
        }}
        output:
        '''
    }
    try:
        response = call_llm([system_role, user_prompt], max_tokens=2000, temperature=0.35)
        clean_response = Data_Cleaner(response)
        parsed_response = json.loads(clean_response)
    except Exception as e:
        # Every failure is caught here, not only JSON decoding, so log it
        # under a general label rather than printing "JSON Decode Error".
        logging.error(f"LLM Error: {e}")
        return {}
    if not isinstance(parsed_response, dict):
        # Valid JSON that is not an object (list, string, ...) would break
        # callers that use .get() on the result.
        logging.error(
            "LLM Error: expected a JSON object, got %s",
            type(parsed_response).__name__,
        )
        return {}
    return parsed_response
+# ------------------- Link extraction, validation helpers and resume processing -------------------
 linkedin_pattern = r"https?://(?:www\.)?linkedin\.com/[\w\-_/]+"
 github_pattern = r"https?://(?:www\.)?github\.com/[\w\-_/]+"
 email_pattern = r"^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$"
 contact_pattern = r"^\+?[\d\s\-()]{7,15}$"
 def extract_links(hyperlinks):
     """Split hyperlinks into LinkedIn and GitHub URLs.

     Each link is tested against ``linkedin_pattern`` first and
     ``github_pattern`` second, anchored at the start of the string; links
     matching neither are dropped.

     Returns:
         A ``(linkedin_links, github_links)`` tuple of lists, each in input order.
     """
     linkedin_links, github_links = [], []
     for url in hyperlinks:
         if re.match(linkedin_pattern, url):
             bucket = linkedin_links
         elif re.match(github_pattern, url):
             bucket = github_links
         else:
             # Neither site: ignore this link.
             continue
         bucket.append(url)
     return linkedin_links, github_links
 def is_valid_email(email):
     """Return True if ``email`` is a string that looks like an email address.

     The whole string must match; ``re.match`` with a trailing ``$`` would
     also accept a value ending in a newline. Non-string values (for example
     ``None`` or a list) return False instead of raising ``TypeError``.
     """
     if not isinstance(email, str):
         return False
     email_regex = r'[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+'
     return re.fullmatch(email_regex, email) is not None
 def validate_contact_email(personal_data):
     """Validate the ``contact`` and ``email`` entries of ``personal_data``.

     A missing key, or the placeholder value ``'Not found'``, counts as
     invalid without calling the validator.

     Returns:
         ``(valid_contact, contact_message, valid_email, email_message)``,
         where each message is the matching 'Valid ...' / 'Invalid ...' label.
     """
     placeholder = 'Not found'
     contact = personal_data.get('contact', placeholder)
     email = personal_data.get('email', placeholder)

     if contact != placeholder:
         valid_contact = is_valid_contact(contact)
     else:
         valid_contact = False

     if email != placeholder:
         valid_email = is_valid_email(email)
     else:
         valid_email = False

     contact_message = 'Valid contact' if valid_contact else 'Invalid contact'
     email_message = 'Valid email' if valid_email else 'Invalid email'
     return valid_contact, contact_message, valid_email, email_message
 def process_resume_data(file_path):
     resume_text, hyperlinks = extract_text_based_on_format(file_path)
     print("Resume converted to text successfully.")
     if not resume_text:
         return {"error": "Text extraction failed"}
     linkedin_links, github_links = extract_links(hyperlinks)
     try:
+        per_data = Model_PersonalDetails_Output(resume_text)
+        pro_data = Model_ProfessionalDetails_Output(resume_text)
         if not per_data:
             per_data = {}
         if not pro_data:
             pro_data = {}
         result = {
             "personal": {
                 "name": per_data.get('personal', {}).get('name', 'Not found'),
                 "location": per_data.get('personal', {}).get('Address', 'Not found'),
                 "linkedin": linkedin_links,
                 "github": github_links,
+                "other_links": hyperlinks
             },
             "professional": {
                 "technical_skills": pro_data.get('professional', {}).get('technical_skills', 'Not found'),
                 ]
             }
         }
         valid_contact, invalid_contact, valid_email, invalid_email = validate_contact_email(result['personal'])
         result['personal']['valid_contact'] = valid_contact
         result['personal']['invalid_contact'] = invalid_contact
         result['personal']['valid_email'] = valid_email
         result['personal']['invalid_email'] = invalid_email
         if per_data or pro_data:
+            print("---------LLM (Groq)-------")
             return result
         else:
+            raise ValueError("LLM returned no output")
     except Exception as e:
+        logging.error(f"LLM failed: {e}. Falling back to SpaCy.")
     print("---------SpaCy-------")
     return Parser_from_model(file_path)