Upload 2 files
Browse files- predict_model.py +87 -0
- train_model.py +74 -0
predict_model.py
ADDED
|
@@ -0,0 +1,87 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
import numpy as np
|
| 3 |
+
import joblib
|
| 4 |
+
import matplotlib.pyplot as plt
|
| 5 |
+
import seaborn as sns
|
| 6 |
+
|
| 7 |
+
# Load saved model, encoder, and training columns produced by train_model.py.
# Fail fast with a clear message if training has not been run yet, mirroring
# the guarded CSV load used elsewhere in this script (previously a missing
# artifact surfaced as a raw FileNotFoundError traceback).
try:
    model = joblib.load('random_forest_model.pkl')
    le = joblib.load('label_encoder.pkl')
    training_columns = joblib.load('training_columns.pkl')
except FileNotFoundError as err:
    print(f"Error: missing model artifact '{err.filename}'. Run train_model.py first.")
    raise SystemExit(1)
|
| 11 |
+
|
| 12 |
+
# Mapping helper
|
| 13 |
+
def map_and_prepare_input_data(input_df):
    """Rename arbitrary input columns to the model's canonical feature names.

    Each input column is matched case-insensitively against known aliases,
    with a fuzzy fallback for near-miss spellings. Only the six model
    features are kept.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Raw input frame with arbitrarily-named columns.

    Returns
    -------
    pandas.DataFrame
        Frame restricted to the six canonical feature columns.

    Raises
    ------
    ValueError
        If any required feature column cannot be found in the input.
    """
    from difflib import get_close_matches

    column_aliases = {
        "App Tech Stack": ["app tech stack", "technology stack", "application stack"],
        "Operating System": ["os", "operating system", "platform"],
        "DB Details": ["db info", "database", "database information", "db"],
        "Authentication Model": ["auth model", "authentication", "authentication type"],
        "Application Components": ["components", "app components", "application parts"],
        "Licence Renewal": ["license", "license renewal", "renewal"],
    }

    # Build lookup: lowercased alias -> canonical column name.
    # BUG FIX: also register each canonical name itself. Previously an input
    # that already used the exact canonical names (e.g. "DB Details",
    # "Application Components") could be rejected, because those strings are
    # not in the alias lists and fall below the 0.8 fuzzy-match cutoff.
    reverse_aliases = {}
    for std_col, aliases in column_aliases.items():
        reverse_aliases[std_col.lower()] = std_col
        for alias in aliases:
            reverse_aliases[alias.lower()] = std_col

    mapping = {}
    for col in input_df.columns:
        col_lower = col.lower()
        if col_lower in reverse_aliases:
            mapping[col] = reverse_aliases[col_lower]
        else:
            # Fuzzy fallback for near-miss spellings (e.g. "databse").
            match = get_close_matches(col_lower, reverse_aliases.keys(), n=1, cutoff=0.8)
            if match:
                mapping[col] = reverse_aliases[match[0]]

    input_df_renamed = input_df.rename(columns=mapping)
    # Keep only recognized model features, dropping everything else.
    input_df_filtered = input_df_renamed[[col for col in input_df_renamed.columns if col in column_aliases]]

    missing_columns = set(column_aliases) - set(input_df_filtered.columns)
    if missing_columns:
        raise ValueError(f"Missing required columns: {missing_columns}")

    return input_df_filtered
|
| 48 |
+
|
| 49 |
+
# Load new input data
try:
    new_data = pd.read_csv('input.csv')
except FileNotFoundError:
    print("Error: 'input.csv' not found.")
    exit()

# Normalize column names to the training schema (raises ValueError if a
# required feature cannot be found), then treat missing cells as the
# 'Unknown' category — the same preprocessing applied at training time.
new_data = map_and_prepare_input_data(new_data)
new_data.fillna('Unknown', inplace=True)

# One-hot encode and align with training columns.
# reindex() drops dummy columns unseen at training time and inserts missing
# ones filled with 0, so the feature matrix matches the trained model exactly.
encoded_data = pd.get_dummies(new_data, columns=[
    'App Tech Stack', 'Operating System', 'DB Details',
    'Authentication Model', 'Application Components', 'Licence Renewal'
])
encoded_data = encoded_data.reindex(columns=training_columns, fill_value=0)

# Predict: the model outputs integer classes; inverse_transform maps them
# back to the original strategy names via the training-time LabelEncoder.
predicted_labels_encoded = model.predict(encoded_data)
predicted_labels = le.inverse_transform(predicted_labels_encoded)
new_data['Predicted Modernization Strategy'] = predicted_labels

# Save to CSV
new_data.to_csv('output.csv', index=False)
print("✅ Predictions saved to 'output.csv'")

# Visualize the distribution of predicted strategies as a bar chart.
# NOTE(review): the color list covers only 5 bars — confirm the number of
# predicted classes never exceeds 5, or colors may be reused/mismatched.
counts = new_data['Predicted Modernization Strategy'].value_counts()
plt.figure(figsize=(10, 6))
counts.plot(kind='bar', color=['skyblue', 'lightgreen', 'salmon', 'plum', 'gold'])
plt.title('Distribution of Predicted Modernization Strategies')
plt.ylabel('Count')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

print("\n Count of Predicted Modernization Strategies:")
for strategy, count in counts.items():
    print(f"{strategy}: {count}")
|
train_model.py
ADDED
|
@@ -0,0 +1,74 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
import numpy as np
|
| 3 |
+
import joblib
|
| 4 |
+
from sklearn.preprocessing import LabelEncoder
|
| 5 |
+
from sklearn.model_selection import train_test_split, RandomizedSearchCV
|
| 6 |
+
from sklearn.ensemble import RandomForestClassifier
|
| 7 |
+
from sklearn.linear_model import LogisticRegression
|
| 8 |
+
from sklearn.svm import SVC
|
| 9 |
+
from sklearn.ensemble import GradientBoostingClassifier
|
| 10 |
+
import xgboost as xgb
|
| 11 |
+
from scipy.stats import randint, uniform
|
| 12 |
+
|
| 13 |
+
# Load dataset
# NOTE(review): expects 'Dataset.csv' in the working directory, containing
# the six categorical feature columns listed below plus a
# 'Modernization Strategy' target column — confirm against the data source.
try:
    df = pd.read_csv('Dataset.csv')
except FileNotFoundError:
    print("Error: 'Dataset.csv' not found.")
    exit()

# Fill missing values: all features are treated as categorical, so missing
# entries become the literal category 'Unknown' rather than being imputed.
df.fillna('Unknown', inplace=True)

# Encode categorical features as one-hot dummy columns.
df_encoded = pd.get_dummies(df, columns=[
    'App Tech Stack', 'Operating System', 'DB Details',
    'Authentication Model', 'Application Components', 'Licence Renewal'
], dummy_na=False)

# Encode target labels to integers; `le` is persisted later so the predict
# script can map predictions back to strategy names.
le = LabelEncoder()
y_encoded = le.fit_transform(df_encoded['Modernization Strategy'])
X = df_encoded.drop(columns=['Modernization Strategy'])

# Train-validation-test split: 80% train, 10% validation, 10% test.
# Stratified on the encoded target so every split keeps the same class
# proportions; random_state fixes the split for reproducibility.
X_train, X_temp, y_train, y_temp = train_test_split(X, y_encoded, test_size=0.2, stratify=y_encoded, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42)
|
| 37 |
+
|
| 38 |
+
# Candidate models, all seeded for reproducibility.
# FIX: dropped `use_label_encoder=False` from XGBClassifier — the parameter
# was deprecated in xgboost 1.3 and removed in 2.x, and the target here is
# already integer-encoded by LabelEncoder, so the flag is unnecessary.
models = {
    'RandomForest': RandomForestClassifier(random_state=42),
    'LogisticRegression': LogisticRegression(random_state=42, max_iter=1000),
    'SVM': SVC(random_state=42),
    'GradientBoosting': GradientBoostingClassifier(random_state=42),
    'XGBoost': xgb.XGBClassifier(random_state=42, eval_metric='logloss')
}

# Hyperparameter search spaces for RandomizedSearchCV.
# randint/uniform are scipy frozen distributions sampled once per iteration;
# note uniform(loc, scale) draws from [loc, loc + scale].
param_grids = {
    'RandomForest': {'n_estimators': randint(50, 200), 'max_depth': randint(10, 50),
                     'min_samples_split': randint(2, 10), 'min_samples_leaf': randint(1, 5)},
    'LogisticRegression': {'C': uniform(0.1, 10)},
    'SVM': {'C': uniform(0.1, 10), 'kernel': ['linear', 'rbf', 'poly']},
    'GradientBoosting': {'n_estimators': randint(50, 200), 'learning_rate': uniform(0.01, 0.3),
                         'max_depth': randint(3, 10)},
    'XGBoost': {'n_estimators': randint(50, 200), 'learning_rate': uniform(0.01, 0.3),
                'max_depth': randint(3, 10), 'subsample': uniform(0.5, 0.5), 'colsample_bytree': uniform(0.5, 0.5)}
}
|
| 58 |
+
|
| 59 |
+
# Randomized hyperparameter search for each candidate model.
# BUG FIX: the original called search.fit(X_val, y_val), tuning and training
# on the ~10% validation split while X_train was never used — so the saved
# model had seen almost none of the data. Fit on the training split instead,
# and use the held-out validation split to report generalization.
best_models = {}
for name in models:
    print(f"Tuning {name}...")
    search = RandomizedSearchCV(models[name], param_grids[name], n_iter=30, cv=5,
                                scoring='accuracy', n_jobs=-1, random_state=42)
    search.fit(X_train, y_train)   # was: search.fit(X_val, y_val)
    best_models[name] = search.best_estimator_
    print(f"Best score for {name}: {search.best_score_:.4f}")
    # Held-out check on data the search never saw.
    print(f"Validation accuracy for {name}: {search.best_estimator_.score(X_val, y_val):.4f}")

# Persist everything predict_model.py needs: the fitted RandomForest model,
# the target LabelEncoder, and the exact one-hot column order for reindexing.
joblib.dump(best_models['RandomForest'], 'random_forest_model.pkl')
joblib.dump(le, 'label_encoder.pkl')
joblib.dump(X.columns.tolist(), 'training_columns.pkl')

print("\n✅ Model and encoders saved successfully.")
|