# Electricity-Cost-Predictor-FastAPI / train_and_save_model.py
# (Hugging Face upload metadata: Kavya-Jain — "Upload 7 files" — commit a603065, verified)
#Next Task -> Training the dataset
#In this file I've done Training and encoding the dataset
#Now as I've already done the EDA...the next task is to train and save the data for preprocessing
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import joblib
import os
import re
# Relative paths keep the script portable across deployment platforms
# (the dataset is expected to sit next to this script).
DATASET_PATH = "electricity_cost_dataset.csv.xlsx"
MODEL_OUTPUT_DIR = "."

# Guarantee the artifact directory exists before any joblib.dump below.
os.makedirs(MODEL_OUTPUT_DIR, exist_ok=True)
def RenamingColumns(Column_Name):
    """Normalize a raw column header to lowercase snake_case.

    Runs of whitespace become single underscores, every remaining
    non-word character is dropped, and the result is lowercased so the
    training columns line up exactly with the FastAPI request fields.

    Args:
        Column_Name: the raw header string from the spreadsheet.

    Returns:
        The normalized column name.
    """
    # Collapse whitespace runs first so interior spaces become separators.
    Column_Name = re.sub(r'\s+', '_', Column_Name)
    # \w already includes '_', so the original [^\w_] class had a
    # redundant underscore; [^\w] is equivalent and clearer.
    Column_Name = re.sub(r'[^\w]', '', Column_Name)
    return Column_Name.lower()
# Load the spreadsheet and normalize its headers.  try/except gives the
# two realistic failure modes (missing file, anything else) a clean exit
# instead of a raw traceback.
try:
    df = pd.read_excel(DATASET_PATH)
    print("Original columns ->\n")
    print(df.columns.tolist())
    # Every header must match the FastAPI request field names exactly;
    # a mismatch would surface as an error inside the API application.
    df.columns = [RenamingColumns(column) for column in df.columns]
    print("Renamed Columns ->\n")
    print(df.columns.tolist())
except FileNotFoundError:
    print(f"Error: Dataset not found! Please ensure the file is in the same directory")
    exit()
except Exception as e:
    print(f"Error : {e}")
    exit()
# The try/except above is the error-handling boundary; from here on all
# references use the normalized (renamed) column names.
TARGET_COL = 'electricity_cost'
if TARGET_COL not in df.columns:
    print(f"Error: Target column '{TARGET_COL}' not found!")
    exit()

# Separate predictors from the target; .drop returns a new frame, so df
# itself is left intact for the y extraction below.
features_df = df.drop(columns=[TARGET_COL])
y = df[TARGET_COL]

# NOTE(review): 'air_qality_index' and 'issue_reolution_time' look like
# typos but presumably mirror the dataset's own headers after renaming —
# confirm against the spreadsheet before "correcting" them.
NUMERICAL_FEATURES = [
    'site_area', 'water_consumption', 'recycling_rate', 'utilisation_rate',
    'air_qality_index', 'issue_reolution_time', 'resident_count'
]
CATEGORICAL_FEATURES = ['structure_type']

# Safety checkpoint: fail fast with a clear message if renaming left any
# expected feature out, instead of erroring deep inside sklearn later.
all_expected_features = NUMERICAL_FEATURES + CATEGORICAL_FEATURES
missing_features = [name for name in all_expected_features if name not in features_df.columns]
if missing_features:
    print(f"Error: The following expected features are missing from the data after renaming: {missing_features}")
    exit()
def _fit_and_dump_imputer(strategy, columns, filename, done_msg, skip_msg):
    # Fit one imputer on the given column family (in place on features_df)
    # and persist it so the API can restore it later via joblib.load().
    imputer = SimpleImputer(strategy=strategy)
    if columns:
        features_df[columns] = imputer.fit_transform(features_df[columns])
        joblib.dump(imputer, os.path.join(MODEL_OUTPUT_DIR, filename))
        print(done_msg)
    else:
        print(skip_msg)
    return imputer

# Mean-fill for numerics, mode-fill for categoricals — same trace as the
# original inline version, just factored through one helper.
numerical_imputer = _fit_and_dump_imputer(
    'mean', NUMERICAL_FEATURES, 'numerical_imputer.pkl',
    "Numerical imputer fitted and saved",
    "No numerical columns to impute")
categorical_imputer = _fit_and_dump_imputer(
    'most_frequent', CATEGORICAL_FEATURES, 'categorical_imputer.pkl',
    "Categorical imputer fitted and saved",
    "No categorical columns to impute")
# joblib.dump persists each fitted transformer so the serving code can
# reload it later with joblib.load() in exactly the trained state.
if 'structure_type' in features_df.columns:
    # Lowercase + strip first so variants like " Mixed " and "mixed"
    # collapse to a single category before encoding.
    cleaned = features_df['structure_type'].astype(str).str.lower().str.strip()
    le_structure_type = LabelEncoder()
    features_df['structure_type'] = le_structure_type.fit_transform(cleaned)
    joblib.dump(le_structure_type, os.path.join(MODEL_OUTPUT_DIR, 'label_encoder_structure_type.pkl'))
    print("LabelEncoder for 'structure_type' fitted and saved.")
else:
    print("structure_type column not found or not categorical, skipping LabelEncoder.")
# Standardize the numeric columns; the identical fitted scaler must be
# applied at inference time, hence the joblib dump.
if not NUMERICAL_FEATURES:
    print("No numerical columns to scale.")
else:
    scaler = StandardScaler()
    features_df[NUMERICAL_FEATURES] = scaler.fit_transform(features_df[NUMERICAL_FEATURES])
    joblib.dump(scaler, os.path.join(MODEL_OUTPUT_DIR, 'scaler.pkl'))
    print("StandardScaler fitted and saved.")
# Every transformer above was persisted individually with joblib.dump so
# the serving code can reconstruct the full preprocessing pipeline.
X = features_df
# y was already extracted right after the target-column check; the
# duplicate `y = df[TARGET_COL]` assignment here was redundant and has
# been removed (no behavior change — same Series either way).

# NOTE(review): the imputers/scaler were fitted on the FULL dataset
# before this split, which leaks test-set statistics into training.
# Tolerable for a demo; a production pipeline should fit transformers on
# X_train only (e.g. via sklearn.pipeline.Pipeline).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LinearRegression()
model.fit(X_train, y_train)
joblib.dump(model, os.path.join(MODEL_OUTPUT_DIR, 'model.pkl'))

# Record the exact feature order the model was trained on; the FastAPI
# layer must send columns in this same order at predict time.
FINAL_MODEL_EXPECTED_FEATURES = X_train.columns.tolist()
print("All expected features from Final Model->\n")
print(FINAL_MODEL_EXPECTED_FEATURES)
# All necessary .pkl files are now created in the current directory.