Niketjain2002 commited on
Commit
62df3aa
·
verified ·
1 Parent(s): b7e194c

Upload src/bias_guard.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. src/bias_guard.py +197 -0
src/bias_guard.py ADDED
@@ -0,0 +1,197 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Bias Mitigation Layer
3
+
4
+ Scans inputs, features, and outputs for potential bias risks.
5
+ Flags issues without blocking — humans make final decisions.
6
+
7
+ Principles:
8
+ - Never use demographic traits (age, gender, race, ethnicity, nationality)
9
+ - Never use university prestige as a direct signal
10
+ - Never use name-based inferences
11
+ - Flag proxy variables that correlate with protected characteristics
12
+ - Log all flagged items for audit
13
+ """
14
+
15
+ import re
16
+ from dataclasses import dataclass, field
17
+
18
+
19
@dataclass
class BiasFlag:
    """A single bias risk detected during an audit pass."""

    category: str  # e.g. "demographic_leak", "proxy_variable", "prestige_bias"
    severity: str  # one of "info", "warning", "critical"
    description: str  # human-readable explanation of what was detected
    field_path: str  # JSON path to the flagged field
    recommendation: str  # suggested remediation shown to the human reviewer
26
+
27
+
28
@dataclass
class BiasAuditResult:
    """Outcome of one bias audit pass.

    Flags are advisory; only a "critical" flag marks the audit as
    failed, since humans make the final decision.
    """

    flags: list[BiasFlag] = field(default_factory=list)
    passed: bool = True
    summary: str = ""

    def add_flag(self, flag: BiasFlag):
        """Record *flag*; a critical severity fails the audit permanently."""
        self.flags.append(flag)
        # Once failed, stay failed — fold the critical check into `passed`.
        self.passed = self.passed and flag.severity != "critical"
38
+
39
+
40
# Known prestige indicators that should not influence scoring.
# Raw regex strings; call sites apply re.IGNORECASE when searching.
PRESTIGE_INDICATORS = [
    # Collective labels for elite university groupings.
    r"\b(ivy league|oxbridge|russell group)\b",
    # Specific high-prestige institution names.
    # NOTE(review): short tokens like "mit" and place names like "columbia"
    # can match unrelated text (e.g. "British Columbia") — confirm acceptable.
    r"\b(harvard|stanford|mit|princeton|yale|columbia|caltech|oxford|cambridge)\b",
    # Ranking phrases such as "top 10 university".
    r"\b(top\s*\d+\s*(university|school|program))\b",
    # Prestige adjective followed (anywhere later in the text) by an institution word.
    r"\b(elite|prestigious|tier[- ]?1|world[- ]?class)\b.*\b(university|school|institution)\b",
]
47
+
48
# Demographic signal patterns that must never be used in scoring.
# Each entry is (regex, signal_type); call sites apply re.IGNORECASE.
# Every bare-word alternation is closed with \b so substrings of longer
# words cannot trigger false flags (e.g. "singleton" must not match
# "single", which the unbounded originals did).
DEMOGRAPHIC_PATTERNS = [
    (r"\b(age|born|year of birth|dob)\s*[:=]\s*\d+", "age_signal"),
    (r"\b(gender|sex)\s*[:=]\s*\w+", "gender_signal"),
    (r"\b(race|ethnicity|national origin)\s*[:=]", "race_signal"),
    (r"\b(married|single|divorced|children|pregnant)\b", "family_status"),
    # handicap\w* keeps matching "handicapped"/"handicaps" as the
    # unbounded original did, while the trailing \b bounds the rest.
    (r"\b(disability|disabled|handicap\w*)\b", "disability_signal"),
    (r"\b(veteran|military service)\b", "veteran_status"),  # info only, not critical
    (r"\b(religion|religious|church|mosque|temple|synagogue)\b", "religion_signal"),
]
58
+
59
# Proxy variables that may correlate with protected characteristics.
# Each entry is (regex, proxy_type, recommendation); the recommendation
# text is surfaced verbatim to reviewers in the resulting BiasFlag.
PROXY_PATTERNS = [
    (r"\b(graduation year|class of \d{4})\b", "age_proxy",
     "Graduation year can be used to infer age. Do not use in scoring."),
    (r"\b(native speaker|native english|accent)\b", "national_origin_proxy",
     "Language nativity can proxy for national origin. Focus on communication skill evidence instead."),
    (r"\b(cultural fit)\b", "affinity_proxy",
     "'Cultural fit' is a known proxy for in-group bias. Use 'values alignment' with specific criteria instead."),
    (r"\b(commute|neighborhood|zip code|postal code)\b", "socioeconomic_proxy",
     "Location granularity can proxy for socioeconomic status."),
]
70
+
71
+
72
class BiasGuard:
    """Scans for and flags potential bias in inputs and outputs.

    All three audit entry points return a BiasAuditResult. Flags are
    advisory; only "critical" findings mark an audit as failed, and
    humans make the final decision.
    """

    def audit_inputs(self, job_description: str, resume_text: str) -> BiasAuditResult:
        """Scan raw inputs for bias risks before processing.

        The job description is checked for both demographic language and
        proxy variables; the resume is checked for demographic leaks only.
        """
        result = BiasAuditResult()

        # Check JD for biased language
        self._check_demographic_signals(job_description, "job_description", result)
        self._check_proxy_variables(job_description, "job_description", result)

        # Check resume for demographic leaks
        self._check_demographic_signals(resume_text, "resume", result)

        result.summary = self._summarize(result)
        return result

    def audit_features(self, role_features: dict, candidate_features: dict) -> BiasAuditResult:
        """Scan extracted features for bias risks.

        Only candidate_features is inspected; role_features stays in the
        signature for caller compatibility but is not read here.
        """
        result = BiasAuditResult()

        # Flag institution names that leak university prestige into features.
        edu = candidate_features.get("education", [])
        for i, entry in enumerate(edu):
            inst = entry.get("institution", "")
            for pattern in PRESTIGE_INDICATORS:
                if re.search(pattern, inst, re.IGNORECASE):
                    result.add_flag(BiasFlag(
                        category="prestige_bias",
                        severity="warning",
                        description=f"University prestige detected: '{inst}'. Must not influence scoring.",
                        field_path=f"candidate_features.education[{i}].institution",
                        recommendation="Use degree field and level only, not institution name.",
                    ))

        # The earliest position start year is an age proxy (info level only).
        positions = candidate_features.get("experience_profile", {}).get("positions", [])
        if positions:
            # 9999 is a sentinel meaning "no usable start year found".
            earliest = min(
                (p.get("start_year", 9999) for p in positions if p.get("start_year")),
                default=9999,
            )
            if earliest < 9999:
                result.add_flag(BiasFlag(
                    category="age_proxy",
                    severity="info",
                    description=f"Earliest career date ({earliest}) can be used to infer age.",
                    field_path="candidate_features.experience_profile.positions",
                    recommendation="Use total_years and relevant_years for scoring, not start dates.",
                ))

        result.summary = self._summarize(result)
        return result

    def audit_output(self, final_output: dict) -> BiasAuditResult:
        """Scan final output reasoning and signal text for bias mentions."""
        result = BiasAuditResult()

        reasoning = final_output.get("reasoning_summary", "")
        signals = (
            final_output.get("positive_signals", [])
            + final_output.get("risk_signals", [])
        )
        # Space-join every piece. The previous form,
        # `reasoning + " ".join(signals)`, had no separator between the
        # reasoning and the first signal, so words at that boundary fused
        # and could create or hide pattern matches.
        all_text = " ".join([reasoning, *signals])

        for pattern in PRESTIGE_INDICATORS:
            if re.search(pattern, all_text, re.IGNORECASE):
                result.add_flag(BiasFlag(
                    category="prestige_bias",
                    severity="critical",
                    description="University prestige mentioned in output reasoning.",
                    field_path="reasoning/signals",
                    recommendation="Remove prestige references. Use skill and experience evidence only.",
                ))

        for pattern, signal_type in DEMOGRAPHIC_PATTERNS:
            if re.search(pattern, all_text, re.IGNORECASE):
                # DEMOGRAPHIC_PATTERNS marks veteran_status as "info only,
                # not critical"; honor that here (as _check_demographic_signals
                # already does) instead of escalating it to critical.
                severity = "info" if signal_type == "veteran_status" else "critical"
                result.add_flag(BiasFlag(
                    category="demographic_leak",
                    severity=severity,
                    description=f"Demographic signal ({signal_type}) found in output.",
                    field_path="reasoning/signals",
                    recommendation="Remove all demographic references from output.",
                ))

        result.summary = self._summarize(result)
        return result

    def _check_demographic_signals(self, text: str, source: str, result: BiasAuditResult) -> None:
        """Add a flag for every demographic pattern matching *text*.

        veteran_status is informational per DEMOGRAPHIC_PATTERNS; all other
        signal types are warnings (raw inputs never yield critical flags).
        """
        for pattern, signal_type in DEMOGRAPHIC_PATTERNS:
            if re.search(pattern, text, re.IGNORECASE):
                severity = "info" if signal_type == "veteran_status" else "warning"
                result.add_flag(BiasFlag(
                    category="demographic_leak",
                    severity=severity,
                    description=f"Demographic signal ({signal_type}) detected in {source}.",
                    field_path=source,
                    recommendation=f"Ensure {signal_type} is not used in scoring.",
                ))

    def _check_proxy_variables(self, text: str, source: str, result: BiasAuditResult) -> None:
        """Add a warning flag for every proxy-variable pattern matching *text*."""
        for pattern, proxy_type, recommendation in PROXY_PATTERNS:
            if re.search(pattern, text, re.IGNORECASE):
                result.add_flag(BiasFlag(
                    category="proxy_variable",
                    severity="warning",
                    description=f"Proxy variable ({proxy_type}) detected in {source}.",
                    field_path=source,
                    recommendation=recommendation,
                ))

    def _summarize(self, result: BiasAuditResult) -> str:
        """Return a one-line summary counting flags by severity."""
        if not result.flags:
            return "No bias risks detected."
        critical = sum(1 for f in result.flags if f.severity == "critical")
        warnings = sum(1 for f in result.flags if f.severity == "warning")
        info = sum(1 for f in result.flags if f.severity == "info")
        parts = []
        if critical:
            parts.append(f"{critical} critical")
        if warnings:
            parts.append(f"{warnings} warnings")
        if info:
            parts.append(f"{info} info")
        return f"Bias audit: {', '.join(parts)} flag(s) found."