Spaces:
Runtime error
Runtime error
File size: 7,337 Bytes
217da35 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 |
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import _VectorizerMixin
from sklearn.feature_selection._base import SelectorMixin
from sklearn.pipeline import Pipeline
def merge_category(x):
if x == "Human Trafficking (A), Commercial Sex Acts":
return "Human Trafficking"
elif x == "Human Trafficking (B), Involuntary Servitude":
return "Human Trafficking"
elif x == "Human Trafficking, Commercial Sex Acts":
return "Human Trafficking"
elif x == "Weapons Offence":
return "Weapons Offense"
elif x == "Drug Violation":
return "Drug Offense"
elif x == "Motor Vehicle Theft?":
return "Motor Vehicle Theft"
elif x == "Suspicious Occ":
return "Suspicious"
elif x == "Rape":
return "Sex Offense"
else:
return x
def merge_category_2(x):
if x == "Gambling":
return "Other"
elif x == "Homicide":
return "Other"
elif x == "Human Trafficking":
return "Other"
elif x == "Liquor Laws":
return "Other"
elif x == "Other Miscellaneous":
return "Other"
elif x == "Weapons Carrying Etc":
return "Weapons Offense"
elif x == "Offences Against The Family And Children":
return "Other Offenses"
elif x == "Sex Offense":
return "Other Offenses"
elif x == "Prostitution":
return "Other"
elif x == "Case Closure":
return "Other"
elif x == "Courtesy Report":
return "Other"
elif x == "Fire Report":
return "Other"
elif x == "Suicide":
return "Other"
elif x == "Embezzlement":
return "Financial Offense"
elif x == "Forgery And Counterfeiting":
return "Financial Offense"
elif x == "Fraud":
return "Financial Offense"
elif x == "Lost Property":
return "Financial Offense"
elif x == "Stolen Property":
return "Financial Offense"
elif x == "Motor Vehicle Theft":
return "Traffic and Vehicle Offense"
elif x == "Recovered Vehicle":
return "Traffic and Vehicle Offense"
elif x == "Traffic Collision":
return "Traffic and Vehicle Offense"
elif x == "Traffic Violation Arrest":
return "Traffic and Vehicle Offense"
elif x == "Vehicle Impounded":
return "Traffic and Vehicle Offense"
elif x == "Vehicle Misplaced":
return "Traffic and Vehicle Offense"
elif x == "Civil Sidewalks":
return "Traffic and Vehicle Offense"
elif x == "Burglary":
return "Theft and Robbery"
elif x == "Larceny Theft":
return "Theft and Robbery"
elif x == "Robbery":
return "Theft and Robbery"
elif x == "Arson":
return "Assault"
elif x == "Disorderly Conduct":
return "Other Offenses"
elif x == "Vandalism":
return "Malicious Mischief"
elif x == "Miscellaneous Investigation":
return "Suspicious"
else:
return x
def get_feature_out(estimator, feature_in):
if hasattr(estimator, 'get_feature_names'):
if isinstance(estimator, _VectorizerMixin):
# handling all vectorizers
return [f'vec_{f}' \
for f in estimator.get_feature_names()]
else:
return estimator.get_feature_names(feature_in)
elif isinstance(estimator, SelectorMixin):
return np.array(feature_in)[estimator.get_support()]
else:
return feature_in
def get_ct_feature_names(ct):
# handles all estimators, pipelines inside ColumnTransfomer
# doesn't work when remainder =='passthrough'
# which requires the input column names.
output_features = []
for name, estimator, features in ct.transformers_:
if name != 'remainder':
if isinstance(estimator, Pipeline):
current_features = features
for step in estimator:
current_features = get_feature_out(step, current_features)
features_out = current_features
else:
features_out = get_feature_out(estimator, features)
output_features.extend(features_out)
elif estimator == 'passthrough':
output_features.extend(ct._feature_names_in[features])
return output_features
def preprocessing_incident(incident_df):
# step 1: dropping irrelavent columns and null values
incident_df.drop(columns=['incident_date','incident_time','incident_year','report_datetime','row_id','incident_id','incident_number',
'report_type_description','filed_online','incident_code','incident_subcategory',
'incident_description','resolution','cad_number','intersection','cnn','analysis_neighborhood',
'supervisor_district','point',':@computed_region_jwn9_ihcz',':@computed_region_26cr_cadq',
':@computed_region_qgnn_b9vv',':@computed_region_nqbw_i6c3',':@computed_region_h4ep_8xdi',
':@computed_region_n4xg_c4py',':@computed_region_jg9y_a9du'], inplace=True)
incident_df.dropna(inplace=True)
# step 2: create new columns
incident_df['incident_month']=pd.to_datetime(incident_df["incident_datetime"]).dt.month
incident_df['incident_year']=pd.to_datetime(incident_df["incident_datetime"]).dt.year
incident_df['incident_hour']=pd.to_datetime(incident_df["incident_datetime"]).dt.hour
#incident_df['incident_dayofweek']=pd.to_datetime(incident_df["incident_datetime"]).dt.dayofweek
# step 3: merging labels
incident_df['incident_category']=incident_df['incident_category'].apply(merge_category)
incident_df['incident_category']=incident_df['incident_category'].apply(merge_category_2)
# step 4: onehot encoding using column Transformer Settings
t = [('ohe-cat', OneHotEncoder(sparse=False, handle_unknown='ignore'), ['incident_day_of_week', 'report_type_code','police_district']),
('do_nothing', SimpleImputer(strategy='most_frequent'), ['incident_datetime', 'incident_category', 'latitude', 'longitude', 'incident_month', 'incident_year', 'incident_hour']),
]
pre_processor = ColumnTransformer(transformers=t, remainder='drop')
incident_df_processed = pre_processor.fit_transform(X=incident_df)
# Get column names
columns = get_ct_feature_names(pre_processor)
incident_df_processed = pd.DataFrame(incident_df_processed, columns=columns)
# step 5: change column types and names
numeric_columns = incident_df_processed.columns.drop(['incident_datetime','incident_category'])
incident_df_processed[numeric_columns] = incident_df_processed[numeric_columns].apply(pd.to_numeric)
incident_df_processed['incident_datetime'] = incident_df_processed['incident_datetime'].apply(pd.to_datetime)
incident_df_processed.rename(columns={"police_district_Out of SF": "police_district_OutOfSF"},inplace=True)
return incident_df_processed |