WebashalarForML committed on
Commit
162497f
·
verified ·
1 Parent(s): 0844bb1

Update utils/mistral.py

Browse files
Files changed (1) hide show
  1. utils/mistral.py +108 -124
utils/mistral.py CHANGED
@@ -2,8 +2,7 @@
2
  import os
3
  import json
4
  import logging
5
- from huggingface_hub import InferenceClient
6
- from huggingface_hub.utils._errors import BadRequestError
7
  from dotenv import load_dotenv
8
  from utils.fileTotext import extract_text_based_on_format
9
  import re
@@ -12,144 +11,153 @@ from utils.spacy import Parser_from_model
12
  # Load environment variables from .env file
13
  load_dotenv()
14
 
15
- # Authenticate with Hugging Face
16
- HFT = os.getenv('HF_TOKEN')
17
- if not HFT:
18
- raise ValueError("Hugging Face token is not set in environment variables.")
19
- client = InferenceClient(model="mistralai/Mistral-Nemo-Instruct-2407", token=HFT)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
 
21
  # Function to clean model output
22
  def Data_Cleaner(text):
23
  pattern = r".*?format:"
24
  result = re.split(pattern, text, maxsplit=1)
25
  if len(result) > 1:
26
- # Handle edge cases where JSON might not be properly formatted after 'format:'
27
  text_after_format = result[1].strip().strip('`').strip('json')
28
  else:
29
  text_after_format = text.strip().strip('`').strip('json')
30
 
31
- # Try to ensure valid JSON is returned
32
  try:
33
- json.loads(text_after_format) # Check if it's valid JSON
34
  return text_after_format
35
  except json.JSONDecodeError:
36
  logging.error("Data cleaning led to invalid JSON")
37
- return text # Return the original text if cleaning goes wrong
38
 
39
 
40
- # Function to call Mistral and process output
41
- def Model_ProfessionalDetails_Output(resume, client):
42
  system_role = {
43
- "role": "system",
44
- "content": "You are a skilled resume parser. Your task is to extract professional details from resumes in a structured JSON format defined by the User. Ensure accuracy and completeness while maintaining the format provided and if field are missing just return 'not found'."
45
  }
 
46
  user_prompt = {
47
- "role": "user",
48
- "content": f'''Act as a resume parser for the following text given in text: {resume}
49
- Extract the text in the following output JSON string as:
50
- {{
51
- "professional": {{
52
- "technical_skills": "List all technical skills, programming languages, frameworks, and technologies mentioned in the resume, ensuring they are not mixed with other skill types.",
53
- "non_technical_skills": "Identify and list non-technical skills such as leadership, teamwork, and communication skills, ensuring they are not mixed with technical skills.",
54
- "tools": "Enumerate all software tools, platforms, and applications (e.g., Figma, Unity, MS Office, etc.) referenced in the resume, distinctly separate from skills.",
55
- "projects": "Extract the names or titles of all projects mentioned in the resume.",
56
- "projects_experience": "Summarize overall project experiences, providing a brief description of each project as detailed in the resume.",
57
- "experience": "Calculate total professional work experience in years and months based on the resume.",
58
- "companies_worked_at": "List the names of all companies where employment is mentioned in the resume.",
59
- "certifications": "Extract and list all certifications obtained as stated in the resume.",
60
- "roles": "Include the names of all job titles or roles held as indicated in the resume.",
61
- "qualifications": "List educational qualifications (e.g., B.Tech) from the resume. If none are found, return 'No education listed'.",
62
- "courses": "Extract the names of completed courses based on the resume. If none are found, return 'No courses listed'.",
63
- "university": "Identify the name of the university, college, or institute attended, based on the resume. If not found, return 'No university listed'.",
64
- "year_of_graduation": "Extract the year of graduation from the resume. If not found, return 'No year of graduation listed'."
 
65
  }}
66
- }}
67
- Json Output:
68
- '''
69
  }
70
 
71
-
72
- response = ""
73
- for message in client.chat_completion(messages=[system_role, user_prompt], max_tokens=4096, stream=True, temperature=0.35):
74
- response += message.choices[0].delta.content
75
-
76
  try:
 
77
  clean_response = Data_Cleaner(response)
78
  parsed_response = json.loads(clean_response)
79
- except json.JSONDecodeError as e:
80
- logging.error(f"JSON Decode Error: {e}")
81
  return {}
82
-
83
  return parsed_response
84
 
85
- def Model_PersonalDetails_Output(resume, client):
 
86
  system_role = {
87
- "role": "system",
88
- "content": "You are a skilled resume parser. Your task is to extract professional details from resumes in a structured JSON format defined by the User. Ensure accuracy and completeness while maintaining the format provided and if field are missing just return 'not found'."
89
  }
 
90
  user_prompt = {
91
- "role": "user",
92
- "content": f'''Act as a resume parser for the following text given in text: {resume}
93
- Extract the text in the following output JSON string as:
94
- {{
95
- "personal": {{
96
- "name": "Extract the full name based on the resume. If not found, return 'No name listed'.",
97
- "contact_number": "Extract the contact number from the resume. If not found, return 'No contact number listed'.",
98
- "email": "Extract the email address from the resume. If not found, return 'No email listed'.",
99
- "Address": "Extract the Address or address from the resume. If not found, return 'No Address listed'.",
100
- "link": "Extract any relevant links (e.g., portfolio, LinkedIn) from the resume. If not found, return 'No link listed'."
101
- }}
102
- }}
103
- output:
104
- '''
105
  }
106
 
107
- # Response
108
- response = ""
109
- for message in client.chat_completion(
110
- messages=[system_role, user_prompt],
111
- max_tokens=3000,
112
- stream=True,
113
- temperature=0.35,
114
- ):
115
- response += message.choices[0].delta.content
116
-
117
- # Handle cases where the response might have formatting issues
118
  try:
119
- #print('The Og response:-->',response)
120
- clean_response=Data_Cleaner(response)
121
- #print("After data cleaning",clean_response)
122
  parsed_response = json.loads(clean_response)
123
-
124
- except json.JSONDecodeError as e:
125
  print("JSON Decode Error:", e)
126
- print("Raw Response:", response)
127
  return {}
128
 
129
  return parsed_response
130
 
131
 
132
- # # Fallback to SpaCy if Mistral fails
133
 
134
- # Add regex pattern for LinkedIn and GitHub links
135
  linkedin_pattern = r"https?://(?:www\.)?linkedin\.com/[\w\-_/]+"
136
  github_pattern = r"https?://(?:www\.)?github\.com/[\w\-_/]+"
137
  email_pattern = r"^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$"
138
  contact_pattern = r"^\+?[\d\s\-()]{7,15}$"
139
 
 
140
  def extract_links(hyperlinks):
141
  linkedin_links = []
142
  github_links = []
143
-
144
- # Iterate through the hyperlinks and apply regex to find LinkedIn and GitHub links
145
  for link in hyperlinks:
146
  if re.match(linkedin_pattern, link):
147
  linkedin_links.append(link)
148
  elif re.match(github_pattern, link):
149
  github_links.append(link)
150
-
151
  return linkedin_links, github_links
152
 
 
153
  def is_valid_email(email):
154
  email_regex = r'^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$'
155
  return re.match(email_regex, email) is not None
@@ -284,45 +292,34 @@ def is_valid_contact(contact):
284
  def validate_contact_email(personal_data):
285
  contact = personal_data.get('contact', 'Not found')
286
  email = personal_data.get('email', 'Not found')
287
-
288
  valid_contact = is_valid_contact(contact) if contact != 'Not found' else False
289
  valid_email = is_valid_email(email) if email != 'Not found' else False
290
 
291
  invalid_contact = 'Invalid contact' if not valid_contact else 'Valid contact'
292
  invalid_email = 'Invalid email' if not valid_email else 'Valid email'
293
-
294
  return valid_contact, invalid_contact, valid_email, invalid_email
295
 
296
 
297
  def process_resume_data(file_path):
298
  resume_text, hyperlinks = extract_text_based_on_format(file_path)
299
  print("Resume converted to text successfully.")
300
-
301
  if not resume_text:
302
  return {"error": "Text extraction failed"}
303
-
304
- # Extract LinkedIn and GitHub links
305
  linkedin_links, github_links = extract_links(hyperlinks)
306
-
307
- # Attempt to use Mistral model for parsing
308
  try:
309
- # Extract personal details using Mistral
310
- per_data = Model_PersonalDetails_Output(resume_text, client)
311
- print(per_data)
312
-
313
- # Extract professional details using Mistral
314
- pro_data = Model_ProfessionalDetails_Output(resume_text, client)
315
- print(pro_data)
316
- # Check if per_data and pro_data have been populated correctly
317
  if not per_data:
318
- logging.warning("Mistral personal data extraction failed.")
319
  per_data = {}
320
-
321
  if not pro_data:
322
- logging.warning("Mistral professional data extraction failed.")
323
  pro_data = {}
324
-
325
- # Combine both personal and professional details into a structured output
326
  result = {
327
  "personal": {
328
  "name": per_data.get('personal', {}).get('name', 'Not found'),
@@ -331,7 +328,7 @@ def process_resume_data(file_path):
331
  "location": per_data.get('personal', {}).get('Address', 'Not found'),
332
  "linkedin": linkedin_links,
333
  "github": github_links,
334
- "other_links": hyperlinks # Store remaining links if needed
335
  },
336
  "professional": {
337
  "technical_skills": pro_data.get('professional', {}).get('technical_skills', 'Not found'),
@@ -356,34 +353,21 @@ def process_resume_data(file_path):
356
  ]
357
  }
358
  }
359
-
360
- # Validate contact and email
361
  valid_contact, invalid_contact, valid_email, invalid_email = validate_contact_email(result['personal'])
362
  result['personal']['valid_contact'] = valid_contact
363
  result['personal']['invalid_contact'] = invalid_contact
364
  result['personal']['valid_email'] = valid_email
365
  result['personal']['invalid_email'] = invalid_email
366
-
367
- # If Mistral produces valid output, return it
368
  if per_data or pro_data:
369
- logging.info("Successfully extracted data using Mistral.")
370
- print(result)
371
- print("---------Mistral-------")
372
  return result
373
  else:
374
- raise ValueError("Mistral returned no output")
375
-
376
- # Handle HuggingFace API or Mistral model errors
377
- except BadRequestError as e:
378
- logging.error(f"HuggingFace API error: {e}. Falling back to SpaCy.")
379
- print(f"HuggingFace API error: {e}. Falling back to SpaCy.")
380
  except Exception as e:
381
- logging.error(f"An error occurred while processing with Mistral: {e}. Falling back to SpaCy.")
382
- print(f"An error occurred while processing with Mistral: {e}. Falling back to SpaCy.")
383
-
384
- # Fallback to SpaCy if Mistral fails
385
- logging.warning("Mistral failed, switching to SpaCy.")
386
  print("---------SpaCy-------")
387
  return Parser_from_model(file_path)
388
-
389
-
 
2
  import os
3
  import json
4
  import logging
5
+ import requests
 
6
  from dotenv import load_dotenv
7
  from utils.fileTotext import extract_text_based_on_format
8
  import re
 
11
  # Load environment variables from .env file
12
  load_dotenv()
13
 
14
+ # Authenticate with Groq
15
+ GROQ_API_KEY = os.getenv('GROQ_API_KEY')
16
+ if not GROQ_API_KEY:
17
+ raise ValueError("Groq API key is not set in environment variables.")
18
+
19
+ GROQ_API_URL = "https://api.groq.com/openai/v1/chat/completions"
20
+ MODEL_NAME = "llama3-70b-8192" # you can switch to mixtral if needed
21
+
22
+
23
+ # 🔥 Groq LLM Call (Replacement for HuggingFace)
24
+ def call_llm(messages, max_tokens=2048, temperature=0.3):
25
+ headers = {
26
+ "Authorization": f"Bearer {GROQ_API_KEY}",
27
+ "Content-Type": "application/json"
28
+ }
29
+
30
+ payload = {
31
+ "model": MODEL_NAME,
32
+ "messages": messages,
33
+ "temperature": temperature,
34
+ "max_tokens": max_tokens
35
+ }
36
+
37
+ response = requests.post(GROQ_API_URL, headers=headers, json=payload)
38
+
39
+ if response.status_code != 200:
40
+ raise Exception(f"Groq API Error: {response.status_code} | {response.text}")
41
+
42
+ result = response.json()
43
+ return result["choices"][0]["message"]["content"]
44
+
45
 
46
  # Function to clean model output
47
  def Data_Cleaner(text):
48
  pattern = r".*?format:"
49
  result = re.split(pattern, text, maxsplit=1)
50
  if len(result) > 1:
 
51
  text_after_format = result[1].strip().strip('`').strip('json')
52
  else:
53
  text_after_format = text.strip().strip('`').strip('json')
54
 
 
55
  try:
56
+ json.loads(text_after_format)
57
  return text_after_format
58
  except json.JSONDecodeError:
59
  logging.error("Data cleaning led to invalid JSON")
60
+ return text
61
 
62
 
63
+ # Function to call LLM and process output
64
+ def Model_ProfessionalDetails_Output(resume, client=None):
65
  system_role = {
66
+ "role": "system",
67
+ "content": "You are a skilled resume parser. Your task is to extract professional details from resumes in a structured JSON format defined by the User. Ensure accuracy and completeness while maintaining the format provided and if field are missing just return 'not found'."
68
  }
69
+
70
  user_prompt = {
71
+ "role": "user",
72
+ "content": f'''Act as a resume parser for the following text given in text: {resume}
73
+ Extract the text in the following output JSON string as:
74
+ {{
75
+ "professional": {{
76
+ "technical_skills": "...",
77
+ "non_technical_skills": "...",
78
+ "tools": "...",
79
+ "projects": "...",
80
+ "projects_experience": "...",
81
+ "experience": "...",
82
+ "companies_worked_at": "...",
83
+ "certifications": "...",
84
+ "roles": "...",
85
+ "qualifications": "...",
86
+ "courses": "...",
87
+ "university": "...",
88
+ "year_of_graduation": "..."
89
+ }}
90
  }}
91
+ Json Output:
92
+ '''
 
93
  }
94
 
 
 
 
 
 
95
  try:
96
+ response = call_llm([system_role, user_prompt], max_tokens=3000, temperature=0.35)
97
  clean_response = Data_Cleaner(response)
98
  parsed_response = json.loads(clean_response)
99
+ except Exception as e:
100
+ logging.error(f"LLM Error: {e}")
101
  return {}
102
+
103
  return parsed_response
104
 
105
+
106
+ def Model_PersonalDetails_Output(resume, client=None):
107
  system_role = {
108
+ "role": "system",
109
+ "content": "You are a skilled resume parser. Your task is to extract professional details from resumes in a structured JSON format defined by the User. Ensure accuracy and completeness while maintaining the format provided and if field are missing just return 'not found'."
110
  }
111
+
112
  user_prompt = {
113
+ "role": "user",
114
+ "content": f'''Act as a resume parser for the following text given in text: {resume}
115
+ Extract the text in the following output JSON string as:
116
+ {{
117
+ "personal": {{
118
+ "name": "...",
119
+ "contact_number": "...",
120
+ "email": "...",
121
+ "Address": "...",
122
+ "link": "..."
123
+ }}
124
+ }}
125
+ output:
126
+ '''
127
  }
128
 
 
 
 
 
 
 
 
 
 
 
 
129
  try:
130
+ response = call_llm([system_role, user_prompt], max_tokens=2000, temperature=0.35)
131
+ clean_response = Data_Cleaner(response)
 
132
  parsed_response = json.loads(clean_response)
133
+ except Exception as e:
 
134
  print("JSON Decode Error:", e)
 
135
  return {}
136
 
137
  return parsed_response
138
 
139
 
140
+ # ------------------- REST OF YOUR CODE UNCHANGED -------------------
141
 
 
142
  linkedin_pattern = r"https?://(?:www\.)?linkedin\.com/[\w\-_/]+"
143
  github_pattern = r"https?://(?:www\.)?github\.com/[\w\-_/]+"
144
  email_pattern = r"^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$"
145
  contact_pattern = r"^\+?[\d\s\-()]{7,15}$"
146
 
147
+
148
  def extract_links(hyperlinks):
149
  linkedin_links = []
150
  github_links = []
151
+
 
152
  for link in hyperlinks:
153
  if re.match(linkedin_pattern, link):
154
  linkedin_links.append(link)
155
  elif re.match(github_pattern, link):
156
  github_links.append(link)
157
+
158
  return linkedin_links, github_links
159
 
160
+
161
  def is_valid_email(email):
162
  email_regex = r'^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$'
163
  return re.match(email_regex, email) is not None
 
292
  def validate_contact_email(personal_data):
293
  contact = personal_data.get('contact', 'Not found')
294
  email = personal_data.get('email', 'Not found')
295
+
296
  valid_contact = is_valid_contact(contact) if contact != 'Not found' else False
297
  valid_email = is_valid_email(email) if email != 'Not found' else False
298
 
299
  invalid_contact = 'Invalid contact' if not valid_contact else 'Valid contact'
300
  invalid_email = 'Invalid email' if not valid_email else 'Valid email'
301
+
302
  return valid_contact, invalid_contact, valid_email, invalid_email
303
 
304
 
305
  def process_resume_data(file_path):
306
  resume_text, hyperlinks = extract_text_based_on_format(file_path)
307
  print("Resume converted to text successfully.")
308
+
309
  if not resume_text:
310
  return {"error": "Text extraction failed"}
311
+
 
312
  linkedin_links, github_links = extract_links(hyperlinks)
313
+
 
314
  try:
315
+ per_data = Model_PersonalDetails_Output(resume_text)
316
+ pro_data = Model_ProfessionalDetails_Output(resume_text)
317
+
 
 
 
 
 
318
  if not per_data:
 
319
  per_data = {}
 
320
  if not pro_data:
 
321
  pro_data = {}
322
+
 
323
  result = {
324
  "personal": {
325
  "name": per_data.get('personal', {}).get('name', 'Not found'),
 
328
  "location": per_data.get('personal', {}).get('Address', 'Not found'),
329
  "linkedin": linkedin_links,
330
  "github": github_links,
331
+ "other_links": hyperlinks
332
  },
333
  "professional": {
334
  "technical_skills": pro_data.get('professional', {}).get('technical_skills', 'Not found'),
 
353
  ]
354
  }
355
  }
356
+
 
357
  valid_contact, invalid_contact, valid_email, invalid_email = validate_contact_email(result['personal'])
358
  result['personal']['valid_contact'] = valid_contact
359
  result['personal']['invalid_contact'] = invalid_contact
360
  result['personal']['valid_email'] = valid_email
361
  result['personal']['invalid_email'] = invalid_email
362
+
 
363
  if per_data or pro_data:
364
+ print("---------LLM (Groq)-------")
 
 
365
  return result
366
  else:
367
+ raise ValueError("LLM returned no output")
368
+
 
 
 
 
369
  except Exception as e:
370
+ logging.error(f"LLM failed: {e}. Falling back to SpaCy.")
371
+
 
 
 
372
  print("---------SpaCy-------")
373
  return Parser_from_model(file_path)