# AccidentsPrediction / dataprep.py
# luismidv's picture
# New commit
# 9b7b488
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error, accuracy_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import mutual_info_regression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import learning_curve
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.model_selection import StratifiedKFold
from sklearn.multioutput import MultiOutputClassifier
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
class DataExtractor():
    """Load an accidents CSV, plot it and train classifiers on it.

    The instance keeps the loaded frame in ``self.dataset`` and the fitted
    encoders / models in further attributes that later methods read, so the
    methods are meant to be called in order: ``datapreparer`` first, then the
    plotting or training helpers, then the prediction helpers.
    """

    # Anonymised codes and the readable names ``data_replacing`` swaps in.
    _COUNTRY_NAMES = {'Country_01': 'Spain', 'Country_02': 'Germany', 'Country_03': 'USA'}
    _LOCAL_NAMES = {'Local_01': 'Madrid', 'Local_02': 'Berlin', 'Local_03': 'New York'}

    # Column predicted by the single-output models.
    _TARGET = 'Industry Sector'
    # Columns predicted together by ``multi_level_classification``.
    _MULTI_TARGETS = ['Industry Sector', 'Genre', 'Countries', 'Employee ou Terceiro']

    _MONTH_LABELS = ["Jan", "Feb", "Mar", "Apr", "May", "Jun",
                     "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]

    def datapreparer(self, route):
        """Read the CSV at *route* into ``self.dataset`` and report missing data.

        Args:
            route: Anything ``pandas.read_csv`` accepts (path or file-like).

        Returns:
            The names of the columns that contain at least one null value
            (empty list when the data is complete).
        """
        self.dataset = pd.read_csv(route)
        missing_columns = [
            col for col in self.dataset.columns if self.dataset[col].isnull().any()
        ]
        if missing_columns:
            print("There is missing information")
        else:
            print("There is no missing data")
        return missing_columns

    def data_replacing(self):
        """Replace the country / local codes in ``self.dataset`` with names, in place."""
        for names in (self._COUNTRY_NAMES, self._LOCAL_NAMES):
            self.dataset.replace(list(names.keys()), list(names.values()), inplace=True)

    def data_visualization_sns(self):
        """Show five pie charts with the value counts of the categorical columns."""
        charts = [
            ('Industry Sector', 'Accidents in each sector', ['gold', 'skyblue', 'lightgreen']),
            ('Genre', 'Accidents in each genre', ['red', 'blue']),
            ('Employee ou Terceiro', 'Employee or third party', ['green', 'purple']),
            ('Accident Level', 'Accident level', ['orange', 'brown', 'black', 'yellow', 'pink']),
            ('Countries', 'Accidents in each country', ['orange', 'brown', 'black']),
        ]
        # Count everything before opening a figure so a missing column fails early.
        counts_per_chart = [self.dataset[column].value_counts() for column, _, _ in charts]
        _, axes = plt.subplots(1, len(charts), figsize=(20, 5))
        for axis, counts, (_, title, colors) in zip(axes, counts_per_chart, charts):
            axis.pie(counts, labels=counts.index, autopct='%1.1f%%',
                     colors=colors, startangle=90)
            axis.set_title(title)
        plt.show()

    def accidents_month(self):
        """Add a ``Month`` column derived from ``Data`` and plot accidents per month."""
        # ``read_csv`` loads ``Data`` as text unless told otherwise, and the
        # ``.dt`` accessor only exists on datetime columns, so parse it here.
        self.dataset["Month"] = pd.to_datetime(self.dataset["Data"]).dt.month
        # Reindex to all twelve months so every bar sits under its own label
        # even when some months have no accidents.
        monthly_accidents = (
            self.dataset["Month"].value_counts().reindex(range(1, 13), fill_value=0)
        )
        plt.figure(figsize=(10, 5))
        monthly_accidents.plot(kind="bar", color="skyblue")
        plt.xlabel("Month")
        plt.ylabel("Number of accidents")
        plt.title("Number of accidents in each month")
        plt.xticks(ticks=range(12), labels=self._MONTH_LABELS, rotation=0)
        plt.grid(axis="y", linestyle="--", alpha=0.7)
        plt.show()

    def redundant_data(self):
        """One-hot encode the whole dataset and plot which column pairs correlate above 0.6."""
        print(f"Dataset size {len(self.dataset)}")
        oneh = OneHotEncoder(sparse_output=False)
        encoded_data = oneh.fit_transform(self.dataset)
        plt.figure(figsize=(12, 12))
        # The heatmap shows a boolean mask, not the coefficients themselves.
        sns.heatmap(np.corrcoef(encoded_data, rowvar=False) > 0.6, annot=True, cbar=False)
        plt.show()

    def model_charging(self):
        """Return the candidate estimators and the hyper-parameter grid of each.

        Returns:
            A dict keyed by model name; each value has a ``'model'`` estimator
            and a ``'parameters'`` grid suitable for ``GridSearchCV``.
        """
        models = {
            'random_forest': {
                'model': RandomForestClassifier(),
                'parameters': {
                    'n_estimators': [50, 100, 200],
                    'max_depth': [None, 10, 20],
                    'min_samples_split': [2, 5, 10],
                },
            },
            'decision_tree': {
                'model': DecisionTreeClassifier(criterion='gini'),
                'parameters': {
                    'max_depth': [None, 10, 20],
                    'min_samples_split': [2, 5, 10],
                },
            },
            'svm': {
                'model': SVC(),
                'parameters': {
                    'C': [0.1, 1, 10],
                    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
                    'gamma': ['scale', 'auto'],
                },
            },
        }
        return models

    def grid_preprocessor(self, models):
        """Grid-search every candidate model on the one-hot encoded dataset.

        Sets ``self.oneh``, ``self.label`` and ``self.encoded_columns``.

        Args:
            models: Mapping shaped like the return value of ``model_charging``.

        Returns:
            A DataFrame with one row per model and the columns ``model``,
            ``best_parameters`` and ``score`` (best cross-validated score).
        """
        # ``handle_unknown='ignore'`` does not change the training matrix; it
        # only lets the fitted encoder transform categories it has not seen.
        self.oneh = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
        self.label = LabelEncoder()
        features = self.dataset.drop(self._TARGET, axis='columns')
        encoded_features = self.oneh.fit_transform(features)
        print(f"Encoded features size {encoded_features.shape}")
        self.encoded_columns = pd.DataFrame(encoded_features).columns
        # LabelEncoder expects a 1-D input, so pass the column itself.
        encoded_labels = self.label.fit_transform(self.dataset[self._TARGET])
        scores = []
        for model_name, model_params in models.items():
            print(f"Using model {model_name}")
            gs = GridSearchCV(model_params['model'], model_params['parameters'],
                              cv=10, return_train_score=False)
            gs.fit(encoded_features, encoded_labels)
            scores.append({
                'model': model_name,
                'best_parameters': gs.best_params_,
                'score': gs.best_score_,
            })
        return pd.DataFrame(scores, columns=['model', 'best_parameters', 'score'])

    def model_selection(self, data, models, model_path='/tmp/model.pkl'):
        """Refit the best-scoring model on an 80/20 split and persist it.

        Sets ``self.parameters``, ``self.oneh``, ``self.label``,
        ``self.selected_model``, ``self.model_plot``, ``self.encoded_features``,
        ``self.encoded_columns``, ``self.encoded_labels``, ``self.train_error``
        and ``self.test_error``.

        Args:
            data: Score table as returned by ``grid_preprocessor``.
            models: Mapping shaped like the return value of ``model_charging``.
            model_path: Where the fitted model is dumped with joblib.

        Returns:
            The predictions for the held-out 20% split.
        """
        best_index = data['score'].idxmax()
        model = data.loc[best_index, 'model']
        self.parameters = data.loc[best_index, 'best_parameters']
        self.oneh = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
        self.label = LabelEncoder()
        self.selected_model = models[model]['model']
        self.selected_model.set_params(**self.parameters)
        self.model_plot = self.selected_model
        features = self.dataset.drop(self._TARGET, axis='columns')
        self.encoded_features = self.oneh.fit_transform(features)
        self.encoded_columns = pd.DataFrame(self.encoded_features).columns
        self.encoded_labels = self.label.fit_transform(self.dataset[self._TARGET])
        x_train, x_test, y_train, y_test = train_test_split(
            self.encoded_features, self.encoded_labels, test_size=0.2, random_state=0)
        self.selected_model.fit(x_train, y_train)
        joblib.dump(self.selected_model, model_path)
        # NOTE: despite their names, both attributes hold accuracy scores.
        self.train_error = accuracy_score(y_train, self.selected_model.predict(x_train))
        predictions = self.selected_model.predict(x_test)
        self.test_error = accuracy_score(y_test, predictions)
        print(f"Training time output {predictions.shape}")
        print(f"Accuracy on the test split {self.test_error * 100:.2f}%")
        return predictions

    @staticmethod
    def _encode_with_fitted(encoder, new_data):
        """Transform *new_data* with an already fitted encoder when possible.

        Returns ``None`` when *new_data* does not carry every raw column the
        encoder was fitted on, so the caller can fall back to its legacy path.
        """
        fitted_names = getattr(encoder, "feature_names_in_", None)
        columns = getattr(new_data, "columns", None)
        if fitted_names is None or columns is None:
            return None
        if not set(fitted_names).issubset(columns):
            return None
        # The encoder requires the columns in the order seen during fit.
        return encoder.transform(new_data[list(fitted_names)])

    def new_predictions(self, new_data):
        """Predict the target for *new_data* with ``self.selected_model``.

        Raw rows (same columns as the training features) are encoded with the
        encoder fitted during training so they line up with the training
        matrix. Any other input is treated as already aligned by the caller
        and goes through ``pd.get_dummies`` as before.
        """
        new_data_encoded = self._encode_with_fitted(self.oneh, new_data)
        if new_data_encoded is None:
            new_data_encoded = pd.get_dummies(new_data)
        predictions = self.selected_model.predict(new_data_encoded)
        print(f"Normal prediction shape {predictions.shape}")
        return predictions

    def get_encoding_info(self, prediction):
        """Map encoded predictions back to the original labels via ``self.label``."""
        return self.label.inverse_transform(prediction)

    def multi_level_classification(self):
        """Train a multi-output random forest that predicts four columns at once.

        Sets ``self.multi_onehot``, ``self.multi_onehot_label``,
        ``self.encoded_multi`` and ``self.multi_model``, and prints the mean
        squared error measured on the held-out 20% split.
        """
        self.multi_onehot = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
        self.multi_onehot_label = OneHotEncoder(sparse_output=False)
        features = self.dataset.drop(self._MULTI_TARGETS, axis='columns')
        print(features.columns)
        labels = self.dataset[self._MULTI_TARGETS]
        encoded_features = self.multi_onehot.fit_transform(features)
        self.encoded_multi = pd.DataFrame(encoded_features).columns
        encoded_labels = self.multi_onehot_label.fit_transform(labels)
        x_train, x_test, y_train, y_test = train_test_split(
            encoded_features, encoded_labels, test_size=0.2, random_state=0)
        self.multi_model = MultiOutputClassifier(
            RandomForestClassifier(max_depth=10, n_estimators=30))
        self.multi_model.fit(x_train, y_train)
        predictions = self.multi_model.predict(x_test)
        error = mean_squared_error(y_test, predictions)
        print(f"Training data shape {x_train.shape}")
        print(f"Multi level training prediction shape {predictions.shape}")
        print(f"Error gotten in training time {error * 100:.2f}%")

    def new_multipredictions_(self, new_data):
        """Predict the four target columns for *new_data* with ``self.multi_model``.

        Raw rows are encoded with the encoder fitted during training. Other
        input keeps the previous encoding, but through a throw-away encoder so
        the fitted ``self.multi_onehot`` is no longer overwritten.
        """
        new_data_encoded = self._encode_with_fitted(self.multi_onehot, new_data)
        if new_data_encoded is None:
            new_data_encoded = OneHotEncoder(sparse_output=False).fit_transform(new_data)
        print(f"New data encoded{new_data_encoded.shape}")
        predictions = self.multi_model.predict(new_data_encoded)
        print(f"New predictions \n {predictions.shape}")
        return predictions

    def get_multiencoding_info(self, prediction):
        """Print and return the labels decoded from a multi-output prediction."""
        print(f"New multi prediciton {prediction}")
        output_decoded = self.multi_onehot_label.inverse_transform(prediction)
        print(f"Categories: \n {output_decoded}")
        return output_decoded

    def check_learning_curve(self):
        """Plot training and validation accuracy against the training-set size.

        Uses the model and encoded data stored by ``model_selection``.
        """
        train_sizes, train_scores, test_scores = learning_curve(
            self.model_plot, self.encoded_features, self.encoded_labels,
            cv=5, scoring="accuracy",
            train_sizes=np.linspace(0.1, 1.0, 10), n_jobs=-1)
        train_mean = train_scores.mean(axis=1)
        test_mean = test_scores.mean(axis=1)
        train_std = train_scores.std(axis=1)
        test_std = test_scores.std(axis=1)
        plt.figure(figsize=(8, 6))
        # The training curve was previously missing: only its band was drawn.
        plt.plot(train_sizes, train_mean, label="Training Score", color="g", marker="o")
        plt.plot(train_sizes, test_mean, label="Validation Score", color="r", marker="o")
        plt.fill_between(train_sizes, train_mean - train_std, train_mean + train_std,
                         alpha=0.2, color="g")
        plt.fill_between(train_sizes, test_mean - test_std, test_mean + test_std,
                         alpha=0.2, color="r")
        plt.xlabel("Number of Training Samples")
        plt.ylabel("Accuracy")
        plt.title("Learning Curve: Training vs Validation")
        plt.legend()
        plt.show()

    def data_asking(self):
        """Ask for one accident record on stdin and return it as a DataFrame.

        Returns:
            A one-row DataFrame with every dataset column except the
            ``Industry Sector`` target. Country and local are stored as the
            ``Country_0N`` / ``Local_0N`` codes built from the numbers entered.
        """
        date = input("Which is the date you want to consult 2016-01-01 00:00:00: ")
        country_number = input("What is the number of the country, 1 2 or 3? :")
        local_number = input("What is the number of the local? :")
        accident_level = input("What is the accident level?")
        potential = input("What is the potential?")
        genre = input("Introduce the genre: ")
        worker_type = input("Third party worker or Employee: ")
        riesgo_critico = input("What is the critico?")
        return pd.DataFrame(
            [[date, f"Country_0{country_number}", f"Local_0{local_number}",
              accident_level, potential, genre, worker_type, riesgo_critico]],
            columns=['Data', 'Countries', 'Local', 'Accident Level',
                     'Potential Accident Level', 'Genre', 'Employee ou Terceiro',
                     'Risco Critico'])
#new_data_extractor.datapreparer(route)
#new_data_extractor.data_replacing()
#new_data_extractor.data_visualization_sns()
#models = new_data_extractor.model_charging()
#data = new_data_extractor.grid_preprocessor(models)
#new_data_extractor.model_selection(data, models)
# new_data = pd.DataFrame([[
# '2016-01-01 00:00:00', 'Country_01', 'Local_01', 'I', 'IV', 'Male', 'Third Party', 'Pressed'
# ]], columns=['Data', 'Countries', 'Local', 'Accident Level',
# 'Potential Accident Level', 'Genre', 'Employee ou Terceiro', 'Risco Critico'])
#new_data = new_data.reindex(columns = new_data_extractor.encoded_columns, fill_value = 0)
#prediction = new_data_extractor.new_predictions(new_data)
#new_data_extractor.get_encoding_info(prediction)
#new_data_extractor.check_learning_curve()
#new_data_extractor.multi_level_classification()
#new_multi = pd.DataFrame([[
# '2016-01-01 00:00:00', 'Local_01', 'I', 'IV', 'Pressed'
#]], columns=['Data', 'Local', 'Accident Level',
# 'Potential Accident Level', 'Risco Critico'])
#new_multi = new_data.reindex(columns = new_data_extractor.encoded_multi, fill_value = 0)
#multi_prediction = new_data_extractor.new_multipredictions_(new_multi)
#new_data_extractor.get_encoding_info(prediction)
#new_data_extractor.get_multiencoding_info(multi_prediction)