applied-ml-project / preprocessor.py
riyadhrazzaq's picture
add alvaro's code
bd9ee57
import joblib
import numpy as np
import config
# Lookup table from rank abbreviation (as it appears in the input data) to an
# integer ordinal code. The rank transforms below look a rank up here and then
# divide the code by the matching L2 norm.
# NOTE(review): presumably a larger code means a more senior rank -- confirm
# against the training pipeline. Codes 1 and 12 are not used by any entry.
rank_to_ordinal = {
    # --- ordinal 2 ---
    'POM': 2, 'POF': 2, 'PO': 2, 'PSA': 2,
    # --- ordinal 4 ---
    'SGT': 4, 'SSA': 4, 'SDS': 4,
    # --- ordinal 3 ---
    'DT3': 3, 'DT2': 3, 'DT1': 3, 'DTS': 3,
    # --- ordinal 5 ---
    'LT': 5, 'LSA': 5, 'LCD': 5,
    # --- ordinals 6 through 11 (kept in the original insertion order) ---
    'CPT': 6,
    'SRG': 9,
    'SCS': 10, 'DCS': 10,
    'DI': 7,
    'INS': 8,
    'DC': 10,
    'AC': 11,
    # --- ordinal 13 ---
    'COD': 13, 'CCA': 13, 'CCT': 13, 'CD': 13, 'CMS': 13, 'COH': 13,
    'COI': 13, 'COP': 13, 'COS': 13, 'COT': 13, 'CPB': 13,
    # --- longer / alternate spellings (presumably aliases of codes above;
    #     confirm against the data source) ---
    'DET': 3, 'SGT DS': 4, 'LT SA': 5, 'LT CD': 5, 'SGT SA': 4,
    'INSP': 8, 'LT.': 5, 'CHIEF': 13, 'DT': 3,
}
def transform_incident_rank(l2_norms, incident_rank):
    """Scale the ordinal code of the rank held at incident time.

    Args:
        l2_norms: Mapping that provides the divisor under ``"incident_rank"``.
        incident_rank: Rank abbreviation; must be a key of ``rank_to_ordinal``.

    Returns:
        A one-element NumPy array holding ``ordinal / l2_norms["incident_rank"]``.

    Raises:
        KeyError: If the rank or the norm entry is missing.
    """
    ordinal = rank_to_ordinal[incident_rank]
    scaled = ordinal / l2_norms["incident_rank"]
    return np.array([scaled])
def transform_current_rank(l2_norms, current_rank):
    """Scale the ordinal code of the officer's current rank.

    Args:
        l2_norms: Mapping that provides the divisor under ``"current_rank"``.
        current_rank: Rank abbreviation; must be a key of ``rank_to_ordinal``.

    Returns:
        A one-element NumPy array holding ``ordinal / l2_norms["current_rank"]``.

    Raises:
        KeyError: If the rank or the norm entry is missing.
    """
    ordinal = rank_to_ordinal[current_rank]
    scaled = ordinal / l2_norms["current_rank"]
    return np.array([scaled])
def transform_previous_complaints(l2_norms, previous_complaints):
    """Scale the previous-complaints count by its L2 norm.

    Args:
        l2_norms: Mapping that provides the divisor under
            ``"previous_complaints"``.
        previous_complaints: Anything ``int()`` accepts (number or numeric
            string); it is truncated to an integer before scaling.

    Returns:
        A one-element NumPy array with the scaled count.
    """
    count = int(previous_complaints)
    return np.array([count / l2_norms["previous_complaints"]])
def transform_complaint_duration_days(l2_norms, complaint_duration_days):
    """Scale the complaint duration (in days) by its L2 norm.

    Args:
        l2_norms: Mapping that provides the divisor under
            ``"complaint_duration_days"``.
        complaint_duration_days: Anything ``int()`` accepts; it is truncated
            to an integer before scaling.

    Returns:
        A one-element NumPy array with the scaled duration.
    """
    days = int(complaint_duration_days)
    return np.array([days / l2_norms["complaint_duration_days"]])
def transform_days_on_force(l2_norms, days_on_force):
    """Scale the officer's days on the force by its L2 norm.

    Args:
        l2_norms: Mapping that provides the divisor under ``"days_on_force"``.
        days_on_force: Anything ``int()`` accepts; it is truncated to an
            integer before scaling.

    Returns:
        A one-element NumPy array with the scaled value.
    """
    days = int(days_on_force)
    return np.array([days / l2_norms["days_on_force"]])
def transform_to_ohe(column_name, value, options=None):
    """One-hot encode ``value`` against the category list of a feature.

    Args:
        column_name: Feature name. Used as the key into
            ``config.features_and_options`` when ``options`` is not given,
            and included in the error message.
        value: The category to encode; must be an element of the option list.
        options: Optional explicit list of categories. When ``None`` the list
            is read from ``config.features_and_options[column_name]``.

    Returns:
        A 1-D NumPy array of length ``len(options)`` that is all zeros except
        for a 1 at the position of ``value``.

    Raises:
        ValueError: If ``value`` is not one of the options. The message names
            the feature and the offending value.
        KeyError: If ``options`` is ``None`` and ``column_name`` is not
            configured.
    """
    if options is None:
        options = config.features_and_options[column_name]
    try:
        hot = options.index(value)
    except ValueError:
        # The stock message ("x is not in list") does not say which feature
        # was being encoded; re-raise the same exception type with context.
        raise ValueError(
            f"{value!r} is not a valid option for feature {column_name!r}"
        ) from None
    one_hot = np.zeros(len(options))
    one_hot[hot] = 1
    return one_hot
def process_officer_race(model_name, current_rank, incident_rank, previous_complaints, complaint_duration_days,
                         days_on_force, officer_gender, fado_type, allegation, ccrb_disposition,
                         penalty_rec, penalty_cat, location_type, contact_outcome,
                         impacted_gender, impacted_race,
                         incident_precinct):
    """Assemble the single-row feature vector for the officer-race models.

    The five numeric features are scaled with the L2 norms stored in
    ``config.officer_race_l2_norm`` (the ``'undersampling'`` entry when
    ``model_name`` contains ``'Undersampling'``, otherwise
    ``'no_undersampling'``). Every categorical feature is one-hot encoded
    with the option lists from ``config.features_and_options``.

    Returns:
        A ``float32`` NumPy array of shape ``(1, n_features)``; numeric
        features come first, followed by the one-hot blocks.
    """
    norm_table = config.officer_race_l2_norm
    norms = norm_table['undersampling' if 'Undersampling' in model_name else 'no_undersampling']

    # Numeric features, each a one-element array.
    rank_now = transform_current_rank(norms, current_rank)
    rank_at_incident = transform_incident_rank(norms, incident_rank)
    complaints = transform_previous_complaints(norms, previous_complaints)
    duration = transform_complaint_duration_days(norms, complaint_duration_days)
    tenure = transform_days_on_force(norms, days_on_force)

    # Categorical features, encoded in the order they are concatenated.
    categorical = [
        transform_to_ohe(column, choice)
        for column, choice in (
            ('OfficerGender', officer_gender),
            ('FADOType', fado_type),
            ('Allegation', allegation),
            ('CCRBDisposition', ccrb_disposition),
            ('PenaltyRec', penalty_rec),
            ('PenaltyCat', penalty_cat),
            ('LocationType', location_type),
            ('ContactOutcome', contact_outcome),
            ("ImpactedGender", impacted_gender),
            ("ImpactedRace", impacted_race),
            ("IncidentPrecinct", incident_precinct),
        )
    ]

    # Note the column order: days_on_force sits between the ranks and the
    # complaint features.
    pieces = [rank_now, rank_at_incident, tenure, complaints, duration, *categorical]
    row = np.concatenate(pieces, dtype=np.float32)
    return row.reshape(1, -1)
def process_officer_gender(model_name, current_rank, incident_rank, previous_complaints, complaint_duration_days,
                           officer_race, days_on_force, fado_type, allegation, ccrb_disposition,
                           penalty_rec, penalty_cat, location_type, contact_outcome,
                           impacted_gender, impacted_race,
                           incident_precinct):
    """Assemble the single-row feature vector for the officer-gender models.

    The five numeric features are scaled with the L2 norms stored in
    ``config.officer_gender_l2_norm`` (the ``'undersampling'`` entry when
    ``model_name`` contains ``'Undersampling'``, otherwise
    ``'no_undersampling'``). ``officer_race`` is one-hot encoded against
    ``config.features_and_options_target_gender["OfficerRace"]``; all other
    categorical features use ``config.features_and_options``.

    Returns:
        A ``float32`` NumPy array of shape ``(1, n_features)``; numeric
        features come first, followed by the one-hot blocks.
    """
    norm_table = config.officer_gender_l2_norm
    norms = norm_table['undersampling' if 'Undersampling' in model_name else 'no_undersampling']

    # Numeric features, each a one-element array.
    rank_now = transform_current_rank(norms, current_rank)
    rank_at_incident = transform_incident_rank(norms, incident_rank)
    complaints = transform_previous_complaints(norms, previous_complaints)
    duration = transform_complaint_duration_days(norms, complaint_duration_days)

    # Officer race uses the option list dedicated to the gender target.
    race_options = config.features_and_options_target_gender["OfficerRace"]
    race = transform_to_ohe('OfficerRace', officer_race, race_options)

    tenure = transform_days_on_force(norms, days_on_force)

    # Remaining categorical features, encoded in concatenation order.
    categorical = [
        transform_to_ohe(column, choice)
        for column, choice in (
            ('FADOType', fado_type),
            ('Allegation', allegation),
            ('CCRBDisposition', ccrb_disposition),
            ('PenaltyRec', penalty_rec),
            ('PenaltyCat', penalty_cat),
            ('LocationType', location_type),
            ('ContactOutcome', contact_outcome),
            ("ImpactedGender", impacted_gender),
            ("ImpactedRace", impacted_race),
            ("IncidentPrecinct", incident_precinct),
        )
    ]

    pieces = [rank_now, rank_at_incident, tenure, complaints, duration, race, *categorical]
    row = np.concatenate(pieces, dtype=np.float32)
    return row.reshape(1, -1)
def process_penalty_cat(model_name, current_rank, incident_rank, previous_complaints, complaint_duration_days,
                        officer_gender, officer_race, days_on_force, fado_type, allegation, location_type, contact_outcome,
                        impacted_gender, impacted_race,
                        incident_precinct):
    """Assemble the single-row feature vector for the penalty-category models.

    The five numeric features are scaled with the L2 norms stored in
    ``config.penalty_cat_l2_norm`` (the ``'undersampling'`` entry when
    ``model_name`` contains ``'Undersampling'``, otherwise
    ``'no_undersampling'``) and the categorical features are one-hot
    encoded. For the models named ``"Neural Network.pth"`` and
    ``"Logistic Regression Balanced"`` the row is additionally passed
    through the fitted scaler stored at
    ``models/PenaltyCat/scaler_model.pkl``.

    Returns:
        A 2-D array with a single row. It is ``float32`` when no scaler is
        applied; otherwise it is whatever ``scaler.transform`` returns.
    """
    # Categorical features.
    # NOTE(review): OfficerRace is encoded against the option list named for
    # the gender target -- confirm it matches the PenaltyCat training columns.
    officer_race = transform_to_ohe('OfficerRace', officer_race,
                                    config.features_and_options_target_gender["OfficerRace"])
    fado_type = transform_to_ohe('FADOType', fado_type)
    allegation = transform_to_ohe('Allegation', allegation)
    location_type = transform_to_ohe('LocationType', location_type)
    contact_outcome = transform_to_ohe('ContactOutcome', contact_outcome)
    impacted_gender = transform_to_ohe("ImpactedGender", impacted_gender)
    impacted_race = transform_to_ohe("ImpactedRace", impacted_race)
    incident_precinct = transform_to_ohe("IncidentPrecinct", incident_precinct)

    # Numeric features, scaled by the norms of the selected training variant.
    l2_norms = config.penalty_cat_l2_norm['undersampling' if 'Undersampling' in model_name else 'no_undersampling']
    current_rank = transform_current_rank(l2_norms, current_rank)
    incident_rank = transform_incident_rank(l2_norms, incident_rank)
    previous_complaints = transform_previous_complaints(l2_norms, previous_complaints)
    complaint_duration_days = transform_complaint_duration_days(l2_norms, complaint_duration_days)
    officer_gender = transform_to_ohe('OfficerGender', officer_gender)
    days_on_force = transform_days_on_force(l2_norms, days_on_force)

    # Column order: days_on_force sits between the ranks and the complaint
    # features.
    arrays = (current_rank,
              incident_rank,
              days_on_force,
              previous_complaints,
              complaint_duration_days,
              officer_gender,
              officer_race,
              fado_type,
              allegation,
              location_type,
              contact_outcome,
              impacted_gender,
              impacted_race,
              incident_precinct)
    input_array = np.concatenate(arrays, dtype=np.float32)
    input_array = input_array.reshape(1, -1)

    if model_name in ["Neural Network.pth", "Logistic Regression Balanced"]:
        # NOTE: joblib.load unpickles the file, so this artifact must come
        # from a trusted source.
        scaler = joblib.load("models/PenaltyCat/scaler_model.pkl")
        # No debug print of scaler.feature_names_in_ here: that attribute is
        # not guaranteed to exist on every fitted scaler, and inference should
        # not write to stdout.
        input_array = scaler.transform(input_array)
    return input_array