sivakarthik08 commited on
Commit
a553846
·
verified ·
1 Parent(s): 89f46e1

Upload 2 files

Browse files
Files changed (2) hide show
  1. predict_model.py +87 -0
  2. train_model.py +74 -0
predict_model.py ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ import joblib
4
+ import matplotlib.pyplot as plt
5
+ import seaborn as sns
6
+
7
# Restore the training artifacts: the fitted classifier, the target
# LabelEncoder, and the exact one-hot column order used at fit time.
_ARTIFACT_PATHS = ('random_forest_model.pkl', 'label_encoder.pkl', 'training_columns.pkl')
model, le, training_columns = (joblib.load(path) for path in _ARTIFACT_PATHS)
11
+
12
# Mapping helper
def map_and_prepare_input_data(input_df):
    """Normalize arbitrary input column names to the model's training schema.

    Each input column is matched against a table of known aliases: first the
    canonical name itself, then an exact case-insensitive alias lookup, and
    finally a fuzzy match (difflib, cutoff 0.8) to tolerate minor typos.
    Matched columns are renamed to their canonical names and the frame is
    reduced to the six required feature columns.

    Fix: the original only matched aliases, so inputs that already used the
    exact canonical headers (e.g. "DB Details", "Application Components")
    failed fuzzy matching and raised ValueError; canonical names now map to
    themselves directly.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Raw input with arbitrarily named columns.

    Returns
    -------
    pandas.DataFrame
        The input restricted to the six canonical feature columns.

    Raises
    ------
    ValueError
        If any required canonical column cannot be matched in the input.
    """
    from difflib import get_close_matches

    column_aliases = {
        "App Tech Stack": ["app tech stack", "technology stack", "application stack"],
        "Operating System": ["os", "operating system", "platform"],
        "DB Details": ["db info", "database", "database information", "db"],
        "Authentication Model": ["auth model", "authentication", "authentication type"],
        "Application Components": ["components", "app components", "application parts"],
        "Licence Renewal": ["license", "license renewal", "renewal"],
    }

    # Invert the alias table: lowercase alias -> canonical column name.
    reverse_aliases = {
        alias.lower(): std_col
        for std_col, aliases in column_aliases.items()
        for alias in aliases
    }

    mapping = {}
    for col in input_df.columns:
        if col in column_aliases:
            # Already the canonical name — keep it as-is.
            mapping[col] = col
            continue
        col_lower = col.lower()
        if col_lower in reverse_aliases:
            mapping[col] = reverse_aliases[col_lower]
        else:
            # Tolerate minor typos/variants via fuzzy matching.
            match = get_close_matches(col_lower, reverse_aliases.keys(), n=1, cutoff=0.8)
            if match:
                mapping[col] = reverse_aliases[match[0]]

    renamed = input_df.rename(columns=mapping)
    # Preserve the (renamed) input column order, dropping unknown columns.
    present = [col for col in renamed.columns if col in column_aliases]

    missing_columns = set(column_aliases) - set(present)
    if missing_columns:
        raise ValueError(f"Missing required columns: {missing_columns}")

    return renamed[present]
48
+
49
# ---- Batch prediction pipeline (script body) ----

# Load new input data; abort with a non-zero exit status when it is absent.
# Fix: the original called exit(), a site-module interactive helper that
# exits with status 0 and can be missing when site is disabled.
try:
    new_data = pd.read_csv('input.csv')
except FileNotFoundError:
    print("Error: 'input.csv' not found.")
    raise SystemExit(1)

# Normalize column names to the training schema and impute missing cells
# with the same 'Unknown' placeholder used during training.
new_data = map_and_prepare_input_data(new_data)
new_data.fillna('Unknown', inplace=True)

# One-hot encode and align with training columns
encoded_data = pd.get_dummies(new_data, columns=[
    'App Tech Stack', 'Operating System', 'DB Details',
    'Authentication Model', 'Application Components', 'Licence Renewal'
])
# reindex guarantees the exact column set/order the model saw at fit time:
# categories unseen in training are dropped, missing ones are zero-filled.
encoded_data = encoded_data.reindex(columns=training_columns, fill_value=0)

# Predict, then translate encoded class ids back to strategy names.
predicted_labels_encoded = model.predict(encoded_data)
predicted_labels = le.inverse_transform(predicted_labels_encoded)
new_data['Predicted Modernization Strategy'] = predicted_labels

# Save to CSV
new_data.to_csv('output.csv', index=False)
print("✅ Predictions saved to 'output.csv'")

# Visualize the predicted class distribution (blocks until window closed).
counts = new_data['Predicted Modernization Strategy'].value_counts()
plt.figure(figsize=(10, 6))
counts.plot(kind='bar', color=['skyblue', 'lightgreen', 'salmon', 'plum', 'gold'])
plt.title('Distribution of Predicted Modernization Strategies')
plt.ylabel('Count')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

print("\n Count of Predicted Modernization Strategies:")
for strategy, count in counts.items():
    print(f"{strategy}: {count}")
train_model.py ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ import joblib
4
+ from sklearn.preprocessing import LabelEncoder
5
+ from sklearn.model_selection import train_test_split, RandomizedSearchCV
6
+ from sklearn.ensemble import RandomForestClassifier
7
+ from sklearn.linear_model import LogisticRegression
8
+ from sklearn.svm import SVC
9
+ from sklearn.ensemble import GradientBoostingClassifier
10
+ import xgboost as xgb
11
+ from scipy.stats import randint, uniform
12
+
13
# Load dataset; abort with a non-zero exit status when it is missing.
# Fix: the original called exit(), a site-module interactive helper that
# exits with status 0 and can be missing when site is disabled.
try:
    df = pd.read_csv('Dataset.csv')
except FileNotFoundError:
    print("Error: 'Dataset.csv' not found.")
    raise SystemExit(1)

# Fill missing values with the same placeholder the prediction script uses.
df.fillna('Unknown', inplace=True)

# One-hot encode the categorical feature columns.
df_encoded = pd.get_dummies(df, columns=[
    'App Tech Stack', 'Operating System', 'DB Details',
    'Authentication Model', 'Application Components', 'Licence Renewal'
], dummy_na=False)

# Encode the target labels as integers for the classifiers.
le = LabelEncoder()
y_encoded = le.fit_transform(df_encoded['Modernization Strategy'])
X = df_encoded.drop(columns=['Modernization Strategy'])

# Stratified 80/10/10 train/validation/test split (fixed seed for repeatability).
X_train, X_temp, y_train, y_temp = train_test_split(X, y_encoded, test_size=0.2, stratify=y_encoded, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42)
37
+
38
# Candidate classifiers, all seeded for reproducibility.
models = {
    'RandomForest': RandomForestClassifier(random_state=42),
    'LogisticRegression': LogisticRegression(random_state=42, max_iter=1000),
    'SVM': SVC(random_state=42),
    'GradientBoosting': GradientBoostingClassifier(random_state=42),
    'XGBoost': xgb.XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss'),
}

# Hyper-parameter search spaces for RandomizedSearchCV, keyed to match `models`.
param_grids = {
    'RandomForest': {
        'n_estimators': randint(50, 200),
        'max_depth': randint(10, 50),
        'min_samples_split': randint(2, 10),
        'min_samples_leaf': randint(1, 5),
    },
    'LogisticRegression': {
        'C': uniform(0.1, 10),
    },
    'SVM': {
        'C': uniform(0.1, 10),
        'kernel': ['linear', 'rbf', 'poly'],
    },
    'GradientBoosting': {
        'n_estimators': randint(50, 200),
        'learning_rate': uniform(0.01, 0.3),
        'max_depth': randint(3, 10),
    },
    'XGBoost': {
        'n_estimators': randint(50, 200),
        'learning_rate': uniform(0.01, 0.3),
        'max_depth': randint(3, 10),
        'subsample': uniform(0.5, 0.5),
        'colsample_bytree': uniform(0.5, 0.5),
    },
}
58
+
59
# Randomized search and select best models.
# Fix: the search is fit on the TRAINING split. The original fit on the 10%
# validation split, which both starved the search of data and leaked the
# hold-out meant for model comparison; cross-validation inside the search
# handles tuning, and the validation split now scores the tuned estimator.
best_models = {}
for name, estimator in models.items():
    print(f"Tuning {name}...")
    search = RandomizedSearchCV(estimator, param_grids[name], n_iter=30, cv=5,
                                scoring='accuracy', n_jobs=-1, random_state=42)
    search.fit(X_train, y_train)
    best_models[name] = search.best_estimator_
    print(f"Best score for {name}: {search.best_score_:.4f}")
    # Untouched-by-tuning estimate of generalization for model comparison.
    print(f"Validation accuracy for {name}: {best_models[name].score(X_val, y_val):.4f}")

# Save the best RandomForest model, the label encoder, and the training
# column order so predict_model.py can reproduce the feature layout.
joblib.dump(best_models['RandomForest'], 'random_forest_model.pkl')
joblib.dump(le, 'label_encoder.pkl')
joblib.dump(X.columns.tolist(), 'training_columns.pkl')

print("\n✅ Model and encoders saved successfully.")