# zefirkash / Klas.py
# Zefirkash's picture
# Create Klas.py
# 81084b5 verified
# 1. Install and import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay
# 2. Load the data and run a first-pass inspection
df = pd.read_csv("dataset.csv")

# Each summary is printed on its own so the sections do not run together.
print(df.head())
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be passed to print() (that would emit a stray "None").
df.info()
print(df.isnull().sum())
print(df.describe())
# 3. Preprocessing
# Impute numeric columns with their column mean. numeric_only=True is needed:
# on pandas >= 2.0 DataFrame.mean() raises TypeError if the frame contains
# non-numeric (e.g. string) columns.
df = df.fillna(df.mean(numeric_only=True))
# Whatever is still missing (non-numeric columns) gets the most frequent value;
# the first row of DataFrame.mode() holds one mode per column.
df = df.fillna(df.mode().iloc[0])
# NOTE(review): the imputation above also fills missing values in 'target' --
# confirm that imputing labels (rather than dropping those rows) is intended.

# Encode the class labels as integers 0..n_classes-1.
le = LabelEncoder()
df['target'] = le.fit_transform(df['target'])

# One-hot encode the remaining categorical columns; drop_first=True drops one
# level per feature to avoid perfectly collinear dummy columns.
df = pd.get_dummies(df, drop_first=True)

X = df.drop(columns=['target'])
y = df['target']

# Stratified 80/20 split so both parts keep the class proportions of y.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Fit the scaler on the training split only and reuse its statistics for the
# test split, so no information from the test data leaks into training.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
# 4. Model training
# Every stochastic estimator is seeded with the same random_state so that the
# comparison between models is reproducible from run to run.
models = {
    "Logistic Regression": LogisticRegression(),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(
        n_estimators=100, learning_rate=0.1, random_state=42
    ),
}

# Fit each model on the scaled training split and keep its predictions for the
# test split, keyed by the model's display name.
predictions = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    predictions[name] = model.predict(X_test)
# 5. Model quality evaluation
def evaluate_model(name, y_true, y_pred):
    """Print accuracy, classification report and confusion matrix for a model.

    Args:
        name: Label shown in the ``=== ... ===`` header line.
        y_true: Ground-truth labels.
        y_pred: Predicted labels compared against ``y_true``.

    Returns:
        None. All results are written to stdout.
    """
    print(f"=== {name} ===")

    # (printed label, metric function) pairs, evaluated and printed in order.
    sections = (
        ("Accuracy:", accuracy_score),
        ("Classification Report:\n", classification_report),
        ("Confusion Matrix:\n", confusion_matrix),
    )
    for label, metric in sections:
        print(label, metric(y_true, y_pred))

    # Visual separator between consecutive model reports.
    print("\n")
# Report the test-split metrics of every model trained above.
for name in predictions:
    y_pred = predictions[name]
    evaluate_model(name, y_test, y_pred)
# 6. Hyperparameter tuning for the random forest
# Candidate values per hyperparameter; the grid search tries every combination.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5, 10],
)

# 5-fold cross-validated search scored by accuracy, using all CPU cores.
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    cv=5,
    scoring="accuracy",
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)
print("Лучшие параметры:", grid_search.best_params_)

# Score the best estimator found by the search on the test split.
best_model = grid_search.best_estimator_
y_pred_best = best_model.predict(X_test)
evaluate_model("Best Random Forest", y_test, y_pred_best)
# 7. Confusion-matrix visualisation
# Draw the tuned model's confusion matrix on the test split in a 5x5 figure.
fig, ax = plt.subplots(figsize=(5, 5))
ConfusionMatrixDisplay.from_estimator(
    estimator=best_model,
    X=X_test,
    y=y_test,
    ax=ax,
    cmap="Blues",
)
plt.show()
# 8. Saving and loading the model
# Only the tuned classifier is persisted here; the fitted `scaler` and `le`
# are not written to disk by this script.
model_path = "best_model.pkl"
joblib.dump(best_model, filename=model_path)
# Read the file straight back to confirm the saved model can be loaded.
loaded_model = joblib.load(filename=model_path)