"""Enrich the AI hiring audit dataset with synthetic demographics.

Reads ``ai_hiring_audit_dataset.csv``, adds randomly assigned ``Gender`` and
``Race`` columns, injects a synthetic bias into ``AI_Decision`` for one
demographic/job-category group, and writes ``hiring_data_enriched.csv``.
"""
import numpy as np
import pandas as pd

INPUT_PATH = 'ai_hiring_audit_dataset.csv'
OUTPUT_PATH = 'hiring_data_enriched.csv'

# Single seed shared by the demographic draw and the bias sampling so that a
# run is fully reproducible.
SEED = 42

# Demographic categories and the probability of drawing each one.
GENDERS = ['Male', 'Female', 'Non-Binary']
GENDER_PROBS = [0.48, 0.48, 0.04]
RACES = ['White', 'Black', 'Asian', 'Hispanic', 'Other']
RACE_PROBS = [0.6, 0.15, 0.15, 0.07, 0.03]

# Group that receives the synthetic bias, and the share of its positive
# decisions that get flipped to negative.
BIASED_GENDER = 'Female'
BIASED_JOB_CATEGORY = 'Software Engineer'
FLIP_FRACTION = 0.3

REQUIRED_COLUMNS = ('Job_Category', 'AI_Decision')


def enrich_hiring_data(df, seed=SEED, flip_fraction=FLIP_FRACTION):
    """Return a copy of *df* with synthetic demographics and injected bias.

    ``Gender`` and ``Race`` are drawn independently per row from ``GENDERS``
    and ``RACES`` with the module-level probabilities; existing columns with
    those names are overwritten. Then ``flip_fraction`` of the rows where
    ``Gender`` is ``'Female'``, ``Job_Category`` is ``'Software Engineer'``
    and ``AI_Decision`` is 1 have ``AI_Decision`` set to 0. The bias is
    applied unconditionally; no check is made for bias already in the data.

    Args:
        df: Frame containing at least ``Job_Category`` and ``AI_Decision``.
            It is not modified. Rows to flip are addressed by index label,
            so the index is expected to be unique (as produced by
            ``pd.read_csv``).
        seed: Seed for both the demographic draw and the row sampling.
        flip_fraction: Fraction of eligible rows whose decision is flipped.

    Returns:
        A new DataFrame with the ``Gender`` and ``Race`` columns added and
        ``AI_Decision`` adjusted.

    Raises:
        KeyError: If a required column is missing from *df*.
    """
    missing = [col for col in REQUIRED_COLUMNS if col not in df.columns]
    if missing:
        raise KeyError(f"Input data is missing required column(s): {missing}")

    enriched = df.copy()

    # A local legacy RandomState yields the same stream as
    # ``np.random.seed(seed)`` followed by ``np.random.choice`` calls, but
    # leaves NumPy's process-wide generator untouched.
    rng = np.random.RandomState(seed)
    enriched['Gender'] = rng.choice(GENDERS, size=len(enriched), p=GENDER_PROBS)
    enriched['Race'] = rng.choice(RACES, size=len(enriched), p=RACE_PROBS)

    eligible = (
        (enriched['Gender'] == BIASED_GENDER)
        & (enriched['Job_Category'] == BIASED_JOB_CATEGORY)
        & (enriched['AI_Decision'] == 1)
    )
    # Skip sampling when nothing qualifies: there is nothing to flip.
    if eligible.any():
        to_flip = enriched[eligible].sample(
            frac=flip_fraction, random_state=seed
        ).index
        enriched.loc[to_flip, 'AI_Decision'] = 0

    return enriched


def main():
    """Load the original dataset, enrich it, and save the result as CSV."""
    df = pd.read_csv(INPUT_PATH)
    enriched = enrich_hiring_data(df)
    enriched.to_csv(OUTPUT_PATH, index=False)
    print(f"Enriched dataset saved as {OUTPUT_PATH}")


if __name__ == "__main__":
    main()