|
|
import pandas as pd
|
|
|
import numpy as np
|
|
|
import joblib
|
|
|
from sklearn.preprocessing import LabelEncoder
|
|
|
from sklearn.model_selection import train_test_split, RandomizedSearchCV
|
|
|
from sklearn.ensemble import RandomForestClassifier
|
|
|
from sklearn.linear_model import LogisticRegression
|
|
|
from sklearn.svm import SVC
|
|
|
from sklearn.ensemble import GradientBoostingClassifier
|
|
|
import xgboost as xgb
|
|
|
from scipy.stats import randint, uniform
|
|
|
|
|
|
|
|
|
# Load the raw dataset; abort with a clear message if the file is missing.
try:
    df = pd.read_csv('Dataset.csv')
except FileNotFoundError:
    print("Error: 'Dataset.csv' not found.")
    # FIX: exit() is a site-module convenience that may not exist under
    # `python -E` or in frozen apps; raise SystemExit with a non-zero code.
    raise SystemExit(1)

# Map missing values to a sentinel 'Unknown' category so that one-hot
# encoding and label encoding treat "missing" as its own level instead of
# dropping or mis-encoding rows. Assignment form preferred over the
# discouraged inplace=True.
df = df.fillna('Unknown')
|
|
|
|
|
|
|
|
|
# Columns treated as categorical features; each becomes a set of one-hot
# indicator columns. dummy_na=False because NaNs were already replaced with
# the 'Unknown' category.
_categorical_cols = [
    'App Tech Stack', 'Operating System', 'DB Details',
    'Authentication Model', 'Application Components', 'Licence Renewal',
]
df_encoded = pd.get_dummies(df, columns=_categorical_cols, dummy_na=False)

# Integer-encode the target and split features from the label column.
le = LabelEncoder()
y_encoded = le.fit_transform(df_encoded['Modernization Strategy'])
X = df_encoded.drop(columns=['Modernization Strategy'])
|
|
|
|
|
|
|
|
|
# Stratified 80/10/10 split: hold out 20%, then halve the holdout into
# validation and test sets. random_state fixed for reproducibility.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y_encoded, test_size=0.2, stratify=y_encoded, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42
)
|
|
|
|
|
|
|
|
|
# Candidate classifiers, all seeded for reproducible results.
models = {
    'RandomForest': RandomForestClassifier(random_state=42),
    # max_iter raised from the default 100 so lbfgs converges on the wide
    # one-hot feature matrix.
    'LogisticRegression': LogisticRegression(random_state=42, max_iter=1000),
    'SVM': SVC(random_state=42),
    'GradientBoosting': GradientBoostingClassifier(random_state=42),
    # FIX: use_label_encoder was deprecated in XGBoost 1.6 and removed in
    # 2.0 (modern versions warn or reject it). It is unnecessary here:
    # targets are already integer-encoded by LabelEncoder above.
    # NOTE(review): if 'Modernization Strategy' has more than two classes,
    # eval_metric should be 'mlogloss' — confirm against the dataset.
    'XGBoost': xgb.XGBClassifier(random_state=42, eval_metric='logloss'),
}
|
|
|
|
|
|
|
|
|
# Hyperparameter search spaces for RandomizedSearchCV.
# randint(a, b) samples integers in [a, b); uniform(loc, scale) samples
# floats uniformly in [loc, loc + scale].
param_grids = {
    'RandomForest': {
        'n_estimators': randint(50, 200),
        'max_depth': randint(10, 50),
        'min_samples_split': randint(2, 10),
        'min_samples_leaf': randint(1, 5),
    },
    'LogisticRegression': {
        'C': uniform(0.1, 10),
    },
    'SVM': {
        'C': uniform(0.1, 10),
        'kernel': ['linear', 'rbf', 'poly'],
    },
    'GradientBoosting': {
        'n_estimators': randint(50, 200),
        'learning_rate': uniform(0.01, 0.3),
        'max_depth': randint(3, 10),
    },
    'XGBoost': {
        'n_estimators': randint(50, 200),
        'learning_rate': uniform(0.01, 0.3),
        'max_depth': randint(3, 10),
        # subsample/colsample sampled in [0.5, 1.0]
        'subsample': uniform(0.5, 0.5),
        'colsample_bytree': uniform(0.5, 0.5),
    },
}
|
|
|
|
|
|
|
|
|
# Tune each candidate model with randomized search (5-fold CV, 30 draws).
# FIX: the search was previously fit on the small validation split (X_val),
# leaving the 80% training split entirely unused and tuning on ~10% of the
# data. Fit on the training data instead, and use the held-out validation
# split only to report generalization accuracy.
best_models = {}
for name, model in models.items():
    print(f"Tuning {name}...")
    search = RandomizedSearchCV(
        model, param_grids[name], n_iter=30, cv=5,
        scoring='accuracy', n_jobs=-1, random_state=42,
    )
    search.fit(X_train, y_train)
    best_models[name] = search.best_estimator_
    print(f"Best score for {name}: {search.best_score_:.4f}")
    # Held-out check on data the search never saw.
    val_acc = search.best_estimator_.score(X_val, y_val)
    print(f"Validation accuracy for {name}: {val_acc:.4f}")
|
|
|
|
|
|
|
|
|
# Persist the tuned RandomForest together with the artifacts needed at
# inference time: the target label encoder (to decode predictions) and the
# exact training column order (to align incoming feature frames).
_artifacts = {
    'random_forest_model.pkl': best_models['RandomForest'],
    'label_encoder.pkl': le,
    'training_columns.pkl': X.columns.tolist(),
}
for _path, _obj in _artifacts.items():
    joblib.dump(_obj, _path)

print("\n✅ Model and encoders saved successfully.")
|
|
|
|