File size: 5,099 Bytes
6abfac2
 
 
 
 
 
 
 
 
 
a603065
6abfac2
 
a603065
 
 
6abfac2
 
 
 
a603065
 
 
6abfac2
 
a603065
 
 
6abfac2
a603065
 
 
 
 
 
 
 
6abfac2
a603065
6abfac2
a603065
 
6abfac2
 
a603065
 
 
6abfac2
a603065
 
 
6abfac2
 
 
 
 
 
a603065
 
6abfac2
 
 
 
 
 
a603065
 
6abfac2
 
 
 
 
 
 
a603065
 
6abfac2
 
 
 
 
a603065
 
 
6abfac2
a603065
6abfac2
 
 
a603065
 
 
6abfac2
a603065
6abfac2
 
 
a603065
 
 
 
 
6abfac2
a603065
6abfac2
 
a603065
 
 
 
6abfac2
a603065
6abfac2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
#Next Task -> Training the dataset
#In this file I've done Training and encoding the dataset
#Now as I've already done the EDA...the next task is to train and save the data for preprocessing

import os
import re
import sys

import joblib
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Changed to a relative path for better portability on deployment platforms
# NOTE(review): the filename has a double extension (.csv.xlsx); pd.read_excel
# is used below, so the file is presumably a genuine Excel workbook — confirm.
DATASET_PATH = "electricity_cost_dataset.csv.xlsx"
# All fitted artifacts (*.pkl) are written to the current working directory.
MODEL_OUTPUT_DIR = "."

# Idempotent: exist_ok=True makes this a no-op when the directory already exists.
os.makedirs(MODEL_OUTPUT_DIR, exist_ok=True)

def RenamingColumns(Column_Name):
    """Normalize a raw column header into a snake_case identifier.

    Collapses any run of whitespace into a single underscore, strips every
    remaining non-word character, and lowercases the result, so dataset
    headers line up exactly with the FastAPI request field names.

    Args:
        Column_Name: Raw column header from the spreadsheet.

    Returns:
        The normalized, lowercase column name.
    """
    Column_Name = re.sub(r'\s+', '_', Column_Name)
    # '_' is already part of \w, so [^\w] keeps the underscores we just added.
    Column_Name = re.sub(r'[^\w]', '', Column_Name)
    return Column_Name.lower()

# Load the workbook and normalize all headers. If the FastAPI input field
# names differ from the training-time column names, prediction requests fail,
# so the renaming must happen before anything else touches the frame.
try:
    df = pd.read_excel(DATASET_PATH)
    print("Original columns ->\n")
    print(df.columns.tolist())

    # Rewrite every header through RenamingColumns in one pass.
    df.columns = [RenamingColumns(col) for col in df.columns]

    print("Renamed Columns ->\n")
    print(df.columns.tolist())

except FileNotFoundError:
    print(f"Error: Dataset not found! Please ensure the file is in the same directory")
    # sys.exit(1): the bare exit() helper comes from the site module (not
    # guaranteed to exist) and would report success (status 0) on failure.
    sys.exit(1)

except Exception as e:
    print(f"Error : {e}")
    sys.exit(1)

# try/except gives a clean error message instead of a traceback when the
# dataset is missing or unreadable. From here on, only the renamed
# (snake_case) column names are used.

# Name of the regression target after header normalization.
TARGET_COL = 'electricity_cost'

if TARGET_COL not in df.columns:
    print(f"Error: Target column '{TARGET_COL}' not found!")
    # Nonzero exit status so callers/CI can detect the failure (bare exit()
    # is the site-module helper and would exit with status 0).
    sys.exit(1)

# Features are everything except the target; the target column becomes y.
features_df = df.drop(columns=[TARGET_COL])
y = df[TARGET_COL]

# Feature lists must match the renamed dataset headers character-for-character.
# NOTE(review): 'air_qality_index' and 'issue_reolution_time' look misspelled,
# but they have to mirror the actual (renamed) spreadsheet headers — confirm
# against the source file before "correcting" them here.
NUMERICAL_FEATURES = [
 'site_area', 'water_consumption', 'recycling_rate', 'utilisation_rate',
 'air_qality_index', 'issue_reolution_time', 'resident_count'
]
CATEGORICAL_FEATURES = ['structure_type']

# Sanity checkpoint: fail fast with a clear message if renaming produced
# column names the pipeline does not expect.
all_expected_features = NUMERICAL_FEATURES + CATEGORICAL_FEATURES
missing_features = [col for col in all_expected_features if col not in features_df.columns]

if missing_features:
    print(f"Error: The following expected features are missing from the data after renaming: {missing_features}")
    # Nonzero status so the failure is visible to callers (unlike bare exit()).
    sys.exit(1)

# Fit the imputers on the training data and persist them with joblib so the
# exact same fill values can be reapplied to incoming requests at inference
# time (joblib.load()).
numerical_imputer = SimpleImputer(strategy='mean')
if not NUMERICAL_FEATURES:
    print("No numerical columns to impute")
else:
    # Mean-fill any gaps in the numeric columns, then save the fitted imputer.
    filled_numeric = numerical_imputer.fit_transform(features_df[NUMERICAL_FEATURES])
    features_df[NUMERICAL_FEATURES] = filled_numeric
    joblib.dump(numerical_imputer, os.path.join(MODEL_OUTPUT_DIR, 'numerical_imputer.pkl'))
    print("Numerical imputer fitted and saved")

categorical_imputer = SimpleImputer(strategy='most_frequent')
if not CATEGORICAL_FEATURES:
    print("No categorical columns to impute")
else:
    # Mode-fill the categorical columns, then save the fitted imputer.
    filled_categorical = categorical_imputer.fit_transform(features_df[CATEGORICAL_FEATURES])
    features_df[CATEGORICAL_FEATURES] = filled_categorical
    joblib.dump(categorical_imputer, os.path.join(MODEL_OUTPUT_DIR, 'categorical_imputer.pkl'))
    print("Categorical imputer fitted and saved")
#I used joblib because I wanted to use this data later as well...therefore, whenever I will be in need of it I will load this with joblib.load()

if 'structure_type' not in features_df.columns:
    print("structure_type column not found or not categorical, skipping LabelEncoder.")
else:
    # Normalize the text (lowercase, trimmed) before encoding so e.g.
    # " Mixed " and "mixed" map to the same label.
    normalized = features_df['structure_type'].astype(str).str.lower().str.strip()
    le_structure_type = LabelEncoder()
    features_df['structure_type'] = le_structure_type.fit_transform(normalized)
    # Persist the encoder so inference can map categories to the same integers.
    joblib.dump(le_structure_type, os.path.join(MODEL_OUTPUT_DIR, 'label_encoder_structure_type.pkl'))
    print("LabelEncoder for 'structure_type' fitted and saved.")

if not NUMERICAL_FEATURES:
    print("No numerical columns to scale.")
else:
    # Standardize the (already imputed) numeric columns and persist the
    # fitted scaler so inference applies the identical transform.
    scaler = StandardScaler()
    scaled_numeric = scaler.fit_transform(features_df[NUMERICAL_FEATURES])
    features_df[NUMERICAL_FEATURES] = scaled_numeric
    joblib.dump(scaler, os.path.join(MODEL_OUTPUT_DIR, 'scaler.pkl'))
    print("StandardScaler fitted and saved.")

#You can see that, I've used joblib.dump to create a separate directory for each imputer and encoder made

# X is the fully preprocessed feature matrix; y (the target) was already
# extracted right after the target-column check — the original re-assigned it
# here redundantly.
X = features_df

# Hold out 20% for a generalization check; fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LinearRegression()
model.fit(X_train, y_train)
# Persist the fitted model alongside the preprocessing artifacts.
joblib.dump(model, os.path.join(MODEL_OUTPUT_DIR, 'model.pkl'))

# Report held-out performance — previously the test split was created but
# never used, so the script gave no signal about model quality.
print(f"R^2 on held-out test set: {model.score(X_test, y_test):.4f}")

# The serving layer must send features in exactly this order.
FINAL_MODEL_EXPECTED_FEATURES = X_train.columns.tolist()
print("All expected features from Final Model->\n")
print(FINAL_MODEL_EXPECTED_FEATURES)

#So, now, all necessary .pkl files created and saved in the current directory