| |
| import numpy as np |
| import pandas as pd |
| import matplotlib.pyplot as plt |
| import seaborn as sns |
| import joblib |
|
|
| from sklearn.model_selection import train_test_split, GridSearchCV |
| from sklearn.preprocessing import StandardScaler, LabelEncoder |
| from sklearn.linear_model import LogisticRegression |
| from sklearn.tree import DecisionTreeClassifier |
| from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier |
| from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay |
|
|
| |
# Load the raw dataset and print a quick overview of its contents.
df = pd.read_csv("dataset.csv")
print(df.head())
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called on its own instead of being passed to print() (which would emit a
# stray "None" and put the info output ahead of everything else).
df.info()
print(df.isnull().sum())
print(df.describe())
|
|
| |
# Impute missing values: numeric columns receive their column mean, and any
# value still missing afterwards receives the most frequent value (mode) of
# its column.
# numeric_only=True restricts mean() to numeric columns; without it, recent
# pandas versions raise a TypeError when the frame contains text columns.
df.fillna(df.mean(numeric_only=True), inplace=True)
df.fillna(df.mode().iloc[0], inplace=True)
# NOTE(review): these statistics are computed on the whole frame, before the
# train/test split performed further down, so test rows influence the imputed
# values. Confirm whether that is acceptable for this experiment.

# Encode the target labels as integers, then one-hot encode the remaining
# non-numeric columns (dropping the first level of each to avoid redundancy).
le = LabelEncoder()
df['target'] = le.fit_transform(df['target'])
df = pd.get_dummies(df, drop_first=True)
|
|
# Separate the label column from the feature columns.
y = df["target"]
X = df.drop(columns=["target"])

# Hold out 20% of the rows for testing; the split is stratified on the label
# and seeded so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Fit the scaler on the training features only, then apply the same
# transformation to both splits.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
|
|
| |
# Baseline classifiers to compare, keyed by the name used in the reports.
models = {
    "Logistic Regression": LogisticRegression(),
    # Seeded like the ensemble models below so repeated runs build the same
    # tree and the comparison is reproducible.
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(
        n_estimators=100, learning_rate=0.1, random_state=42
    ),
}

# Fit every model on the training split and keep its predictions for the
# test split, keyed by the same model name.
predictions = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    predictions[name] = model.predict(X_test)
|
|
| |
def evaluate_model(name, y_true, y_pred):
    """Print a small evaluation report for one set of predictions.

    The report consists of a header with the model name followed by the
    accuracy, the classification report and the confusion matrix, each
    computed from ``y_true`` and ``y_pred``.

    Args:
        name: Label shown in the report header.
        y_true: Ground-truth labels.
        y_pred: Predicted labels to compare against ``y_true``.

    Returns:
        None. Everything is written to standard output.
    """
    print(f"=== {name} ===")
    # Each section is computed right before it is printed, in this order.
    sections = (
        ("Accuracy:", accuracy_score),
        ("Classification Report:\n", classification_report),
        ("Confusion Matrix:\n", confusion_matrix),
    )
    for heading, metric in sections:
        print(heading, metric(y_true, y_pred))
    print("\n")
|
|
# Print the evaluation report of every baseline model on the test split.
for model_name, model_predictions in predictions.items():
    evaluate_model(model_name, y_test, model_predictions)
|
|
| |
# Candidate hyper-parameter values for the random forest.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5, 10],
)

# Cross-validated (5 folds) search over every combination in the grid,
# scored by accuracy; n_jobs=-1 lets the search run in parallel.
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    scoring="accuracy",
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# The printed label is Russian for "Best parameters:".
print("Лучшие параметры:", grid_search.best_params_)

# Evaluate the best estimator found by the search on the test split.
best_model = grid_search.best_estimator_
y_pred_best = best_model.predict(X_test)
evaluate_model("Best Random Forest", y_test, y_pred_best)
|
|
| |
# Draw the tuned model's confusion matrix for the test split on a 5x5 figure.
fig, ax = plt.subplots(figsize=(5, 5))
ConfusionMatrixDisplay.from_estimator(
    estimator=best_model,
    X=X_test,
    y=y_test,
    cmap="Blues",
    ax=ax,
)
plt.show()
|
|
| |
# Persist the tuned estimator to disk, then read it back from the same file.
# NOTE(review): only best_model is written here; the fitted scaler and label
# encoder created earlier in the script are not saved by this block.
model_path = "best_model.pkl"
joblib.dump(best_model, model_path)
loaded_model = joblib.load(model_path)