# Preprocessing.py -- uploaded by Kavya-Jain (commit ad2e526, verified)
#Now, comes another important task that is Preprocessing the data
import pandas as pd
import joblib
import os
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, StandardScaler
import requests
# Direct-download URLs for the fitted preprocessing artifacts and the model.
# NOTE(review): GITHUB_BASE_URL is never referenced in this file -- each
# artifact URL below is fully spelled out instead; confirm whether the base
# URL is still needed.
GITHUB_BASE_URL = "https://raw.githubusercontent.com/jainkavya738/Electricity-ML-Artifacts/main/"
NUM_IMPUTER_URL = "https://github.com/jainkavya738/Electricity-ML-Artifacts/raw/refs/heads/main/numerical_imputer.pkl"
CAT_IMPUTER_URL = "https://github.com/jainkavya738/Electricity-ML-Artifacts/raw/refs/heads/main/categorical_imputer.pkl"
LE_STRUCTURE_TYPE_URL = "https://github.com/jainkavya738/Electricity-ML-Artifacts/raw/refs/heads/main/label_encoder_structure_type.pkl"
SCALER_URL = "https://github.com/jainkavya738/Electricity-ML-Artifacts/raw/refs/heads/main/scaler.pkl"
MODEL_URL = "https://github.com/jainkavya738/Electricity-ML-Artifacts/raw/refs/heads/main/model.pkl"
def download_and_load_pkl(url, filename):
    """Download a pickled artifact from ``url`` into /tmp and load it with joblib.

    Parameters
    ----------
    url : str
        Direct-download URL of the ``.pkl`` file.
    filename : str
        Name used for the temporary file under ``/tmp``.

    Returns
    -------
    object or None
        The unpickled object, or ``None`` if the download or the load failed
        (errors are printed, not raised, so callers can treat a missing
        artifact as optional).
    """
    print(f"Attempting to download {filename} from {url}")
    temp_filepath = None
    try:
        # timeout so a stalled connection cannot hang startup indefinitely
        response = requests.get(url, stream=True, timeout=30)
        response.raise_for_status()
        temp_filepath = os.path.join('/tmp', filename)
        with open(temp_filepath, 'wb') as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)
        # SECURITY NOTE: joblib.load unpickles arbitrary objects -- only ever
        # point these URLs at a repository you control.
        obj = joblib.load(temp_filepath)
        print(f"Successfully downloaded and loaded {filename}")
        return obj
    except requests.exceptions.RequestException as e:
        # temp_filepath may still be None here, so report the filename instead
        print(f"Error downloading {filename} from {url}: {e}")
        return None
    except Exception as e:
        print(f"Error loading {temp_filepath} after download: {e}")
        return None
    finally:
        # Single cleanup point; runs on both the success and failure paths.
        if temp_filepath and os.path.exists(temp_filepath):
            try:
                os.remove(temp_filepath)
                print(f"Cleaned up temporary file: {temp_filepath}")
            except OSError as e:
                print(f"Warning: Could not remove temporary file {temp_filepath}: {e}")
# Fetch every fitted preprocessing artifact plus the trained model at import
# time.  Each name is None when its download/load failed, which lets
# preprocess_input() below skip the corresponding step instead of crashing.
numerical_imputer = download_and_load_pkl(NUM_IMPUTER_URL, "numerical_imputer.pkl")
categorical_imputer = download_and_load_pkl(CAT_IMPUTER_URL, "categorical_imputer.pkl")
le_structure_type = download_and_load_pkl(LE_STRUCTURE_TYPE_URL, "label_encoder_structure_type.pkl")
scaler = download_and_load_pkl(SCALER_URL, "scaler.pkl")
model = download_and_load_pkl(MODEL_URL, "model.pkl")
# These act as placeholders for the rest of the file, so no NameError occurs
# even when a download fails; the try/except inside the loader surfaces any
# download or unpickling problem as a printed message rather than a crash.
# Feature groupings used by the preprocessing pipeline below.
# NOTE(review): 'air_qality_index' and 'issue_reolution_time' look misspelled,
# but they presumably match the column names the imputers/scaler/model were
# fitted on -- do NOT "fix" the spelling here without retraining the artifacts.
NUMERICAL_FEATURES = [
    'site_area', 'water_consumption', 'recycling_rate', 'utilisation_rate', 'air_qality_index', 'issue_reolution_time', 'resident_count'
]
CATEGORICAL_FEATURES = ['structure_type']
# Exact column order the trained model expects at prediction time.
FINAL_MODEL_EXPECTED_FEATURES = [
    'site_area', 'water_consumption', 'recycling_rate', 'utilisation_rate',
    'air_qality_index', 'issue_reolution_time', 'resident_count', 'structure_type'
]
# FINAL_MODEL_EXPECTED_FEATURES lists the columns (and order) of the final
# preprocessed output fed to the trained model.
# The input arrives as a dictionary, but the fitted scikit-learn transformers
# expect a pandas DataFrame, so the function below wraps the dictionary in a
# one-row DataFrame, applies each preprocessing step, and returns the result.
def preprocess_input(input_data: dict) -> pd.DataFrame:
    """Apply the fitted preprocessing pipeline to one raw input record.

    Steps (each skipped when its fitted artifact failed to load):
      1. Standardize the 'structure_type' text (lowercase, stripped).
      2. Impute numerical then categorical features.
      3. Label-encode 'structure_type'.
      4. Scale the numerical features.
      5. Reorder columns to FINAL_MODEL_EXPECTED_FEATURES.

    Parameters
    ----------
    input_data : dict
        Mapping of feature name -> raw value for a single observation.

    Returns
    -------
    pd.DataFrame
        One-row frame with columns in FINAL_MODEL_EXPECTED_FEATURES order.

    Raises
    ------
    ValueError
        When required columns are missing or an unseen category is supplied.
    RuntimeError
        When any fitted transformer fails unexpectedly.
    """

    def _require_columns(df, columns, kind):
        # Fail fast (raise stops the function immediately) with a clear
        # message listing every expected column absent from the input.
        missing = [col for col in columns if col not in df.columns]
        if missing:
            raise ValueError(f"Error : {kind} features {missing} are missing from input DataFrame!")

    df_processed = pd.DataFrame([input_data])
    print(f"DataFrame after initial creation (df_processed)-> \n{df_processed}")

    # Normalize the category text so it matches what the encoder was fitted on.
    if 'structure_type' in df_processed.columns:
        df_processed['structure_type'] = df_processed['structure_type'].astype(str).str.lower().str.strip()
        print(f"'structure_type' standardized to: '{df_processed['structure_type'].iloc[0]}'")

    if numerical_imputer is not None and NUMERICAL_FEATURES:
        _require_columns(df_processed, NUMERICAL_FEATURES, "Numerical")
        try:
            df_processed[NUMERICAL_FEATURES] = numerical_imputer.transform(df_processed[NUMERICAL_FEATURES])
        except Exception as e:
            # 'from e' chains the original traceback so the root cause stays visible
            raise RuntimeError(
                f"Error during numerical imputation\n"
                f"Error : {e}"
            ) from e

    if categorical_imputer is not None and CATEGORICAL_FEATURES:
        _require_columns(df_processed, CATEGORICAL_FEATURES, "Categorical")
        try:
            df_processed[CATEGORICAL_FEATURES] = categorical_imputer.transform(df_processed[CATEGORICAL_FEATURES])
        except Exception as e:
            raise RuntimeError(
                f"Error during categorical imputation\n"
                f"Error : {e}"
            ) from e

    if le_structure_type is not None and 'structure_type' in df_processed.columns:
        try:
            df_processed['structure_type'] = le_structure_type.transform(df_processed['structure_type'])
        except ValueError as e:
            # LabelEncoder raises ValueError for categories unseen at fit time.
            raise ValueError(
                f"Unknown category -> '{df_processed['structure_type'].iloc[0]}' in column 'structure_type'\n"
                f"Error : {e}"
            ) from e
        except Exception as e:
            raise RuntimeError(f"Error during Label Encoding for 'structure_type'...Error: {e}") from e

    if scaler is not None and NUMERICAL_FEATURES:
        _require_columns(df_processed, NUMERICAL_FEATURES, "Numerical")
        try:
            df_processed[NUMERICAL_FEATURES] = scaler.transform(df_processed[NUMERICAL_FEATURES])
        except Exception as e:
            raise RuntimeError(
                f"Error during scaling\n"
                f"Error: {e}"
            ) from e

    print(f"Current df_processed columns before final reorder: {df_processed.columns.tolist()}")
    # Checkpoint: guarantee every column the model was trained on exists.
    # NOTE(review): the 0 fill happens AFTER scaling, so it is 0 in scaled
    # space rather than raw units -- confirm this default is intentional.
    for col in FINAL_MODEL_EXPECTED_FEATURES:
        if col not in df_processed.columns:
            print(f"Adding missing column: '{col}' with value 0.")
            df_processed[col] = 0

    # Reorder to the exact column order the trained model expects.
    df_final = df_processed[FINAL_MODEL_EXPECTED_FEATURES]
    print(f"Final DataFrame for prediction: \n{df_final}")
    return df_final
#The function above performs all of the post-training data preprocessing: numerical and categorical imputation, label encoding, and scaling.
#Error messages and checkpoints are printed throughout because I got stuck in this process several times, and they make the failure point obvious. With preprocessing complete, the next step is DEPLOYMENT -> building the FastAPI service and deploying it (e.g. on AWS).