19arjun89 committed on
Commit
8bd38c7
·
verified ·
1 Parent(s): 56f2260

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +144 -2
app.py CHANGED
@@ -9,6 +9,7 @@ from langchain.prompts import PromptTemplate
9
  from typing import List, Dict
10
  import os
11
  import tempfile
 
12
 
13
  # Initialize embeddings
14
  embeddings = HuggingFaceEmbeddings()
@@ -33,6 +34,82 @@ llm = ChatGroq(
33
  temperature = 0,seed = 42
34
  )
35
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
36
  def process_candidate_submission(resume_file, job_description: str) -> str:
37
  # Load and process resume
38
  if resume_file.name.endswith('.pdf'):
@@ -41,6 +118,7 @@ def process_candidate_submission(resume_file, job_description: str) -> str:
41
  loader = UnstructuredFileLoader(resume_file.name)
42
 
43
  resume_doc = loader.load()[0]
 
44
 
45
  # Create proper prompt template
46
  prompt_template = PromptTemplate(
@@ -65,7 +143,7 @@ def process_candidate_submission(resume_file, job_description: str) -> str:
65
  )
66
 
67
  response = chain.run({
68
- "resume_text": resume_doc.page_content,
69
  "job_description": job_description
70
  })
71
 
@@ -106,14 +184,23 @@ def store_resumes(resume_files: List[tempfile._TemporaryFileWrapper]) -> str:
106
  loader = UnstructuredFileLoader(file.name)
107
  docs = loader.load()
108
 
 
 
 
 
 
 
 
 
109
  # Extract filename without extension as resume ID
110
  resume_id = os.path.splitext(os.path.basename(file.name))[0]
111
 
112
  # Add metadata to each chunk
113
- splits = text_splitter.split_documents(docs)
114
  for split in splits:
115
  split.metadata["resume_id"] = resume_id
116
  split.metadata["source"] = "resume"
 
117
 
118
  all_docs.extend(splits)
119
 
@@ -204,6 +291,49 @@ def self_correct_recommendation(original_recommendation: str, verification_issue
204
  "source_docs": "\n---\n".join(source_docs)
205
  })
206
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
207
 
208
  def analyze_candidates(job_description: str) -> str:
209
  # First extract required skills from job description
@@ -407,6 +537,15 @@ def analyze_candidates(job_description: str) -> str:
407
  else:
408
  revision_note = ""
409
 
 
 
 
 
 
 
 
 
 
410
  # Add verification warnings if factuality score < 0.95
411
  verification_notes = ""
412
  if culture_verification["factuality_score"] < 0.95 or skills_verification["factuality_score"] < 0.95:
@@ -434,6 +573,9 @@ def analyze_candidates(job_description: str) -> str:
434
 
435
  HIRING RECOMMENDATION:
436
  {final_recommendation}{revision_note}{verification_notes}
 
 
 
437
 
438
  ----------------------------------------
439
  """)
 
9
  from typing import List, Dict
10
  import os
11
  import tempfile
12
+ import re
13
 
14
  # Initialize embeddings
15
  embeddings = HuggingFaceEmbeddings()
 
34
  temperature = 0,seed = 42
35
  )
36
 
37
def anonymize_resume_text(text: str):
    """
    Heuristic redaction to remove common personal identifiers from resumes
    (email, phone, URLs, addresses, demographic fields, and likely name header).

    Args:
        text: Raw resume text extracted from the uploaded document.

    Returns:
        tuple: (sanitized_text, redaction_notes) where redaction_notes is a
        sorted, de-duplicated list of human-readable descriptions of what
        was removed.
    """
    redactions = []
    sanitized = text

    # Email addresses
    sanitized2 = re.sub(r'[\w\.-]+@[\w\.-]+\.\w+', '[REDACTED_EMAIL]', sanitized)
    if sanitized2 != sanitized:
        redactions.append("Email addresses removed")
    sanitized = sanitized2

    # Phone numbers (broad heuristic).
    # NOTE: the separator class uses [ \t] rather than \s on purpose — \s
    # matches newlines, which fused unrelated digit runs on consecutive
    # lines (e.g. employment years "2019\n2021") into one false "phone"
    # match and destroyed legitimate resume content.
    sanitized2 = re.sub(r'(\+?\d[\d\-\(\) \t]{7,}\d)', '[REDACTED_PHONE]', sanitized)
    if sanitized2 != sanitized:
        redactions.append("Phone numbers removed")
    sanitized = sanitized2

    # URLs (personal sites, LinkedIn, etc.)
    sanitized2 = re.sub(r'(https?://\S+|www\.\S+)', '[REDACTED_URL]', sanitized)
    if sanitized2 != sanitized:
        redactions.append("URLs removed")
    sanitized = sanitized2

    # Physical addresses (heuristic: street lines, unit numbers, US ZIP codes)
    address_patterns = [
        r'\b\d{1,6}\s+\w+(?:\s+\w+){0,4}\s+(Street|St|Avenue|Ave|Road|Rd|Boulevard|Blvd|Lane|Ln|Drive|Dr|Court|Ct|Way|Parkway|Pkwy)\b\.?',
        r'\b(Apt|Apartment|Unit|Suite|Ste)\s*#?\s*\w+\b',
        r'\b\d{5}(?:-\d{4})?\b'  # US ZIP
    ]
    for pat in address_patterns:
        sanitized2 = re.sub(pat, '[REDACTED_ADDRESS]', sanitized, flags=re.IGNORECASE)
        if sanitized2 != sanitized:
            redactions.append("Address/location identifiers removed")
        sanitized = sanitized2

    # Explicit demographic fields (labeled "Key: value" style entries)
    demographic_patterns = [
        r'\b(gender|sex)\s*:\s*\w+\b',
        r'\b(age)\s*:\s*\d+\b',
        r'\b(dob|date of birth)\s*:\s*[\w\s,/-]+\b',
        r'\b(marital status)\s*:\s*\w+\b',
        r'\b(nationality)\s*:\s*\w+\b',
        r'\b(citizenship)\s*:\s*[\w\s,/-]+\b',
        r'\b(pronouns?)\s*:\s*[\w/]+\b',
    ]
    for pat in demographic_patterns:
        sanitized2 = re.sub(pat, '[REDACTED_DEMOGRAPHIC]', sanitized, flags=re.IGNORECASE)
        if sanitized2 != sanitized:
            redactions.append("Explicit demographic fields removed")
        sanitized = sanitized2

    # Likely name header masking: resumes conventionally open with the
    # candidate's name, so a first line of 2-4 plain words is masked.
    lines = sanitized.splitlines()
    if lines:
        first_line = lines[0].strip()
        if re.fullmatch(r"[A-Za-z]+(?:\s+[A-Za-z]+){1,3}", first_line):
            lines[0] = "[REDACTED_NAME]"
            sanitized = "\n".join(lines)
            redactions.append("Likely name header removed")

    # Cleanup: collapse blank-line runs left behind by the redactions
    sanitized = re.sub(r'\n{3,}', '\n\n', sanitized).strip()
    redactions = sorted(set(redactions))

    return sanitized, redactions
106
+
107
+
108
def join_loaded_docs_text(docs):
    """Combine a list of LangChain Documents into a single text blob.

    Documents without a truthy ``page_content`` attribute are skipped.
    """
    texts = []
    for doc in docs:
        content = getattr(doc, "page_content", None)
        if content:
            texts.append(content)
    return "\n".join(texts)
111
+
112
+
113
  def process_candidate_submission(resume_file, job_description: str) -> str:
114
  # Load and process resume
115
  if resume_file.name.endswith('.pdf'):
 
118
  loader = UnstructuredFileLoader(resume_file.name)
119
 
120
  resume_doc = loader.load()[0]
121
+ sanitized_resume_text, _ = anonymize_resume_text(resume_doc.page_content)
122
 
123
  # Create proper prompt template
124
  prompt_template = PromptTemplate(
 
143
  )
144
 
145
  response = chain.run({
146
+ "resume_text": sanitized_resume_text,
147
  "job_description": job_description
148
  })
149
 
 
184
  loader = UnstructuredFileLoader(file.name)
185
  docs = loader.load()
186
 
187
+ # Combine + anonymize before splitting
188
+ raw_text = join_loaded_docs_text(docs)
189
+ sanitized_text, redactions = anonymize_resume_text(raw_text)
190
+
191
+ # Create a single Document to split
192
+ from langchain.schema import Document
193
+ base_doc = Document(page_content=sanitized_text, metadata={})
194
+
195
  # Extract filename without extension as resume ID
196
  resume_id = os.path.splitext(os.path.basename(file.name))[0]
197
 
198
  # Add metadata to each chunk
199
+ splits = text_splitter.split_documents([base_doc])
200
  for split in splits:
201
  split.metadata["resume_id"] = resume_id
202
  split.metadata["source"] = "resume"
203
+ split.metadata["sanitized"] = True
204
 
205
  all_docs.extend(splits)
206
 
 
291
  "source_docs": "\n---\n".join(source_docs)
292
  })
293
 
294
# Prompt used by run_bias_audit(): asks the LLM to review a completed
# candidate evaluation (skills analysis, culture analysis, final
# recommendation) for bias, treating the job description and culture
# documents as the source of truth. The template fixes the exact output
# format so downstream display code can rely on the headings.
bias_audit_prompt = PromptTemplate(
    input_variables=["skills_analysis", "culture_analysis", "final_recommendation", "job_desc", "culture_docs"],
    template="""Review the following candidate evaluation for potential bias:

SKILLS ANALYSIS:
{skills_analysis}

CULTURE ANALYSIS:
{culture_analysis}

FINAL RECOMMENDATION:
{final_recommendation}

REFERENCE MATERIALS (source of truth):
Job Description:
{job_desc}

Culture Documents:
{culture_docs}

Check specifically for:
- Over-reliance on education pedigree or past employers over actual skills
- Penalizing nontraditional career paths
- Use of subjective or exclusionary language in cultural fit
- Reasoning not supported by job description or culture documents

Output format (exactly):
BIAS AUDIT RESULT:
- Bias Indicators: [List any concerns or 'None Detected']
- Transparency Note: [Short note for recruiter if concerns exist]
"""
)
326
+
327
def run_bias_audit(skills_analysis, culture_analysis, final_recommendation, job_desc, culture_docs):
    """Run the bias audit LLM chain over a completed candidate evaluation.

    Fills bias_audit_prompt with the evaluation artifacts plus the reference
    materials (job description and culture documents) and returns the raw
    audit text produced by the model.
    """
    payload = {
        "skills_analysis": skills_analysis,
        "culture_analysis": culture_analysis,
        "final_recommendation": final_recommendation,
        "job_desc": job_desc,
        "culture_docs": culture_docs,
    }
    audit_chain = LLMChain(llm=llm, prompt=bias_audit_prompt)
    return audit_chain.run(payload)
336
+
337
 
338
  def analyze_candidates(job_description: str) -> str:
339
  # First extract required skills from job description
 
537
  else:
538
  revision_note = ""
539
 
540
+ # Bias audit (triangulates across skills, culture, and final recommendation)
541
+ bias_audit = run_bias_audit(
542
+ skills_analysis=skills_fit,
543
+ culture_analysis=culture_fit,
544
+ final_recommendation=final_recommendation,
545
+ job_desc=job_description,
546
+ culture_docs=culture_context
547
+ )
548
+
549
  # Add verification warnings if factuality score < 0.95
550
  verification_notes = ""
551
  if culture_verification["factuality_score"] < 0.95 or skills_verification["factuality_score"] < 0.95:
 
573
 
574
  HIRING RECOMMENDATION:
575
  {final_recommendation}{revision_note}{verification_notes}
576
+
577
+ BIAS AUDIT:
578
+ {bias_audit}
579
 
580
  ----------------------------------------
581
  """)