# Electricity-Cost-Predictor-FastAPI / train_and_save_model.py
# (Hugging Face upload metadata: Kavya-Jain — "Upload 7 files" — commit a603065, verified)
#Next Task -> Training the dataset
#In this file I've done Training and encoding the dataset
#Now as I've already done the EDA...the next task is to train and save the data for preprocessing
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import joblib
import os
import re
# Relative paths keep the script portable across deployment platforms
# (the dataset is expected to sit next to this script).
DATASET_PATH = "electricity_cost_dataset.csv.xlsx"
MODEL_OUTPUT_DIR = "."

# Guarantee the artifact directory exists before any joblib.dump below.
os.makedirs(MODEL_OUTPUT_DIR, exist_ok=True)
def RenamingColumns(Column_Name):
    """Normalize a raw column header to lowercase snake_case.

    Runs of whitespace become single underscores, every remaining
    non-word character is dropped, and the result is lowercased so the
    training columns line up exactly with the FastAPI request fields.

    Args:
        Column_Name: the raw header string from the spreadsheet.

    Returns:
        The normalized column name.
    """
    # Collapse whitespace runs first so interior spaces become separators.
    Column_Name = re.sub(r'\s+', '_', Column_Name)
    # \w already includes '_', so the original [^\w_] class had a
    # redundant underscore; [^\w] is equivalent and clearer.
    Column_Name = re.sub(r'[^\w]', '', Column_Name)
    return Column_Name.lower()
# Load the spreadsheet and normalize its headers.  try/except gives the
# two realistic failure modes (missing file, anything else) a clean exit
# instead of a raw traceback.
try:
    df = pd.read_excel(DATASET_PATH)
    print("Original columns ->\n")
    print(df.columns.tolist())
    # Every header must match the FastAPI request field names exactly;
    # a mismatch would surface as an error inside the API application.
    df.columns = [RenamingColumns(column) for column in df.columns]
    print("Renamed Columns ->\n")
    print(df.columns.tolist())
except FileNotFoundError:
    print(f"Error: Dataset not found! Please ensure the file is in the same directory")
    exit()
except Exception as e:
    print(f"Error : {e}")
    exit()
# The try/except above is the error-handling boundary; from here on all
# references use the normalized (renamed) column names.
TARGET_COL = 'electricity_cost'
if TARGET_COL not in df.columns:
    print(f"Error: Target column '{TARGET_COL}' not found!")
    exit()

# Separate predictors from the target; .drop returns a new frame, so df
# itself is left intact for the y extraction below.
features_df = df.drop(columns=[TARGET_COL])
y = df[TARGET_COL]

# NOTE(review): 'air_qality_index' and 'issue_reolution_time' look like
# typos but presumably mirror the dataset's own headers after renaming —
# confirm against the spreadsheet before "correcting" them.
NUMERICAL_FEATURES = [
    'site_area', 'water_consumption', 'recycling_rate', 'utilisation_rate',
    'air_qality_index', 'issue_reolution_time', 'resident_count'
]
CATEGORICAL_FEATURES = ['structure_type']

# Safety checkpoint: fail fast with a clear message if renaming left any
# expected feature out, instead of erroring deep inside sklearn later.
all_expected_features = NUMERICAL_FEATURES + CATEGORICAL_FEATURES
missing_features = [name for name in all_expected_features if name not in features_df.columns]
if missing_features:
    print(f"Error: The following expected features are missing from the data after renaming: {missing_features}")
    exit()
def _fit_and_dump_imputer(strategy, columns, filename, done_msg, skip_msg):
    # Fit one imputer on the given column family (in place on features_df)
    # and persist it so the API can restore it later via joblib.load().
    imputer = SimpleImputer(strategy=strategy)
    if columns:
        features_df[columns] = imputer.fit_transform(features_df[columns])
        joblib.dump(imputer, os.path.join(MODEL_OUTPUT_DIR, filename))
        print(done_msg)
    else:
        print(skip_msg)
    return imputer

# Mean-fill for numerics, mode-fill for categoricals — same trace as the
# original inline version, just factored through one helper.
numerical_imputer = _fit_and_dump_imputer(
    'mean', NUMERICAL_FEATURES, 'numerical_imputer.pkl',
    "Numerical imputer fitted and saved",
    "No numerical columns to impute")
categorical_imputer = _fit_and_dump_imputer(
    'most_frequent', CATEGORICAL_FEATURES, 'categorical_imputer.pkl',
    "Categorical imputer fitted and saved",
    "No categorical columns to impute")
# joblib.dump persists each fitted transformer so the serving code can
# reload it later with joblib.load() in exactly the trained state.
if 'structure_type' in features_df.columns:
    # Lowercase + strip first so variants like " Mixed " and "mixed"
    # collapse to a single category before encoding.
    cleaned = features_df['structure_type'].astype(str).str.lower().str.strip()
    le_structure_type = LabelEncoder()
    features_df['structure_type'] = le_structure_type.fit_transform(cleaned)
    joblib.dump(le_structure_type, os.path.join(MODEL_OUTPUT_DIR, 'label_encoder_structure_type.pkl'))
    print("LabelEncoder for 'structure_type' fitted and saved.")
else:
    print("structure_type column not found or not categorical, skipping LabelEncoder.")
# Standardize the numeric columns; the identical fitted scaler must be
# applied at inference time, hence the joblib dump.
if not NUMERICAL_FEATURES:
    print("No numerical columns to scale.")
else:
    scaler = StandardScaler()
    features_df[NUMERICAL_FEATURES] = scaler.fit_transform(features_df[NUMERICAL_FEATURES])
    joblib.dump(scaler, os.path.join(MODEL_OUTPUT_DIR, 'scaler.pkl'))
    print("StandardScaler fitted and saved.")
# Every transformer above was persisted individually with joblib.dump so
# the serving code can reconstruct the full preprocessing pipeline.
X = features_df
# y was already extracted right after the target-column check; the
# duplicate `y = df[TARGET_COL]` assignment here was redundant and has
# been removed (no behavior change — same Series either way).

# NOTE(review): the imputers/scaler were fitted on the FULL dataset
# before this split, which leaks test-set statistics into training.
# Tolerable for a demo; a production pipeline should fit transformers on
# X_train only (e.g. via sklearn.pipeline.Pipeline).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LinearRegression()
model.fit(X_train, y_train)
joblib.dump(model, os.path.join(MODEL_OUTPUT_DIR, 'model.pkl'))

# Record the exact feature order the model was trained on; the FastAPI
# layer must send columns in this same order at predict time.
FINAL_MODEL_EXPECTED_FEATURES = X_train.columns.tolist()
print("All expected features from Final Model->\n")
print(FINAL_MODEL_EXPECTED_FEATURES)
# All necessary .pkl files are now created in the current directory.