# Spaces: Build error  (page status text captured along with this script; not part of the code)
import os
import warnings

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
# Suppress all warnings for the remainder of the script.
warnings.filterwarnings("ignore")

# Load and preprocess data
# The "_c39" column is discarded immediately after loading.
data = pd.read_csv("dataset/insurance_claims.csv").drop(columns="_c39")
# Cells holding a literal "?" are treated as missing values from here on.
data.replace('?', np.nan, inplace=True)
# Function to check data
def check_data(data: pd.DataFrame) -> pd.DataFrame:
    """Summarise every column of *data* in a single table.

    Args:
        data: DataFrame to inspect.

    Returns:
        A DataFrame indexed by the column names of *data* with these columns:
        ``type`` (dtype), ``amount_unique`` (number of distinct non-null
        values), ``unique_values`` (array of distinct values as returned by
        ``Series.unique``), ``null_values`` (count of missing entries) and
        ``percentage_null_values(%)`` (missing entries as a percentage of the
        row count, rounded to two decimals).
    """
    # Count the missing entries once; both null-related columns derive from it.
    null_counts = data.isna().sum()
    return pd.DataFrame({
        'type': data.dtypes,
        'amount_unique': data.nunique(),
        'unique_values': [data[x].unique() for x in data.columns],
        'null_values': null_counts,
        'percentage_null_values(%)': (null_counts / data.shape[0] * 100).round(2),
    })
# Print the per-column summary, columns with the most missing values first.
print(check_data(data).sort_values("null_values", ascending=False))
# Fill missing values with mode
for column in data.columns:
    column_modes = data[column].mode()
    if column_modes.empty:
        # A column with no observed values has no mode; leave it untouched
        # instead of failing on the positional lookup below.
        continue
    # mode() can return several equally frequent values; take the first one.
    mode_value = column_modes.iloc[0]
    data[column] = data[column].fillna(mode_value)
# Encode categorical variables
# `le` is created up front so the name exists even when no column qualifies.
le = LabelEncoder()
# One fitted encoder per encoded column, keyed by column name, so the
# string -> integer mapping of every column stays available after the loop.
label_encoders = {}
for col in data.columns:
    col_dtype = data[col].dtype
    # Accept both the generic object dtype and pandas' dedicated string dtype.
    if pd.api.types.is_object_dtype(col_dtype) or pd.api.types.is_string_dtype(col_dtype):
        le = LabelEncoder()
        data[col] = le.fit_transform(data[col])
        label_encoders[col] = le
# Drop less important columns
to_drop = [
    'policy_number',
    'policy_bind_date',
    'insured_zip',
    'incident_location',
    'auto_year',
    'auto_make',
    'auto_model',
]
data.drop(columns=to_drop, inplace=True)
# Correlation heatmap
plt.figure(figsize=(23, 23))
corr_matrix = data.corr()
# Boolean mask that is True on and above the diagonal; seaborn leaves masked
# cells blank, so only the lower triangle is drawn.
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
sns.heatmap(corr_matrix.round(2), mask=mask, vmin=-1, vmax=1, annot=True, cmap='magma')
plt.title('Triangle Correlation Heatmap', fontsize=18, pad=16)
plt.show()
# Drop less correlated features
to_drop = [
    'injury_claim',
    'property_claim',
    'vehicle_claim',
    'incident_type',
    'age',
    'incident_hour_of_the_day',
    'insured_occupation',
]
data.drop(columns=to_drop, inplace=True)
# Feature importance
# Select the predictors by name rather than by position, so the split stays
# correct even if 'fraud_reported' is not the last column.
X = data.drop(columns='fraud_reported')
Y = data['fraud_reported']
# Fixed seed: the forest is randomised, and without one the importance ranking
# (and therefore the columns kept below) can differ from run to run.
model = RandomForestClassifier(n_estimators=1000, random_state=42)
model.fit(X, Y)
feat_importances = pd.Series(model.feature_importances_, index=X.columns)
# Keep the ten highest-ranked predictors plus the target column.
final_feat = feat_importances.nlargest(10).index.tolist()
final_feat.append('fraud_reported')
data_new = data[final_feat]
# NOTE(review): the importances are fitted on every row, including rows that
# are later held out by train_test_split - confirm this is acceptable for the
# accuracy figure reported at the end of the script.
# Prepare data for modeling
# Work on a copy so the selected-feature frame itself is left untouched.
df_model = data_new.copy()
X = df_model.drop(columns='fraud_reported')
y = df_model['fraud_reported']
# Hold out 30% of the rows for evaluation; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=43
)
# Train the final model
final_model = RandomForestClassifier(
    criterion='gini',
    max_depth=5,
    min_samples_leaf=4,
    min_samples_split=10,
    n_estimators=100,
    random_state=42,
    class_weight='balanced',
)
# Fit on the training portion only; the test portion is kept for evaluation.
final_model.fit(X_train, y_train)
# Evaluate the model
# Accuracy is measured on the held-out test rows.
y_pred = final_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy}")
# Save the model
# The dump fails if the target folder is missing, so create it first
# (a no-op when it already exists).
os.makedirs('model', exist_ok=True)
joblib.dump(final_model, 'model/only_model.joblib')
print("Model saved successfully.")