# 1. Install and import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    ConfusionMatrixDisplay,
)

# 2. Load the data and take a first look at it
df = pd.read_csv("dataset.csv")
print(df.head())
# DataFrame.info() writes its summary to stdout itself and returns None,
# so it is called on its own instead of being passed to print().
df.info()
print(df.isnull().sum())
print(df.describe())

# 3. Preprocessing
# Fill gaps in numeric columns with the column mean. numeric_only=True keeps
# mean() from raising TypeError on text columns under recent pandas versions.
df.fillna(df.mean(numeric_only=True), inplace=True)
# Whatever is still missing (non-numeric columns) gets the most frequent value.
df.fillna(df.mode().iloc[0], inplace=True)
# NOTE(review): both imputations use statistics of the full dataset, i.e. they
# are computed before the train/test split, so test rows influence the fill
# values. Consider fitting the imputation on the training part only.

le = LabelEncoder()
df['target'] = le.fit_transform(df['target'])
df = pd.get_dummies(df, drop_first=True)

X = df.drop(columns=['target'])
y = df['target']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# The scaler is fitted on the training part only and then applied to both parts.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# 4. Model training
models = {
    "Logistic Regression": LogisticRegression(),
    # random_state is fixed like for the other stochastic models below,
    # so repeated runs give the same tree.
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(
        n_estimators=100, learning_rate=0.1, random_state=42
    ),
}

predictions = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    predictions[name] = model.predict(X_test)


# 5. Model quality evaluation
def evaluate_model(name, y_true, y_pred) -> None:
    """Print accuracy, classification report and confusion matrix for a model.

    Args:
        name: Label printed in the header line of the report.
        y_true: True class labels.
        y_pred: Predicted class labels, aligned with ``y_true``.
    """
    print(f"=== {name} ===")
    print("Accuracy:", accuracy_score(y_true, y_pred))
    print("Classification Report:\n", classification_report(y_true, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(y_true, y_pred))
    print("\n")


for name, y_pred in predictions.items():
    evaluate_model(name, y_test, y_pred)

# 6. Hyperparameter tuning for the random forest
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
}

grid_search = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

print("Лучшие параметры:", grid_search.best_params_)

best_model = grid_search.best_estimator_
y_pred_best = best_model.predict(X_test)
evaluate_model("Best Random Forest", y_test, y_pred_best)

# 7. Confusion matrix visualisation
fig, ax = plt.subplots(figsize=(5, 5))
ConfusionMatrixDisplay.from_estimator(best_model, X_test, y_test, ax=ax, cmap='Blues')
plt.show()

# 8. Saving and loading the model
# NOTE(review): best_model was fitted on features transformed by `scaler`;
# only the model is written here, so anything that loads "best_model.pkl"
# elsewhere must apply the same scaling to its inputs.
joblib.dump(best_model, "best_model.pkl")
loaded_model = joblib.load("best_model.pkl")