# Preprocessing.py -- uploaded by Kavya-Jain (commit ad2e526, verified)
#Now, comes another important task that is Preprocessing the data
import pandas as pd
import joblib
import os
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, StandardScaler
import requests
# Direct-download URLs for the fitted preprocessing artifacts and the model.
# NOTE(review): GITHUB_BASE_URL is never referenced in this file -- each
# artifact URL below is fully spelled out instead; confirm whether the base
# URL is still needed.
GITHUB_BASE_URL = "https://raw.githubusercontent.com/jainkavya738/Electricity-ML-Artifacts/main/"
NUM_IMPUTER_URL = "https://github.com/jainkavya738/Electricity-ML-Artifacts/raw/refs/heads/main/numerical_imputer.pkl"
CAT_IMPUTER_URL = "https://github.com/jainkavya738/Electricity-ML-Artifacts/raw/refs/heads/main/categorical_imputer.pkl"
LE_STRUCTURE_TYPE_URL = "https://github.com/jainkavya738/Electricity-ML-Artifacts/raw/refs/heads/main/label_encoder_structure_type.pkl"
SCALER_URL = "https://github.com/jainkavya738/Electricity-ML-Artifacts/raw/refs/heads/main/scaler.pkl"
MODEL_URL = "https://github.com/jainkavya738/Electricity-ML-Artifacts/raw/refs/heads/main/model.pkl"
def download_and_load_pkl(url, filename):
    """Download a pickled artifact from ``url`` into /tmp and load it with joblib.

    Parameters
    ----------
    url : str
        Direct-download URL of the ``.pkl`` file.
    filename : str
        Name used for the temporary file under ``/tmp``.

    Returns
    -------
    object or None
        The unpickled object, or ``None`` if the download or the load failed
        (errors are printed, not raised, so callers can treat a missing
        artifact as optional).
    """
    print(f"Attempting to download {filename} from {url}")
    temp_filepath = None
    try:
        # timeout so a stalled connection cannot hang startup indefinitely
        response = requests.get(url, stream=True, timeout=30)
        response.raise_for_status()
        temp_filepath = os.path.join('/tmp', filename)
        with open(temp_filepath, 'wb') as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)
        # SECURITY NOTE: joblib.load unpickles arbitrary objects -- only ever
        # point these URLs at a repository you control.
        obj = joblib.load(temp_filepath)
        print(f"Successfully downloaded and loaded {filename}")
        return obj
    except requests.exceptions.RequestException as e:
        # temp_filepath may still be None here, so report the filename instead
        print(f"Error downloading {filename} from {url}: {e}")
        return None
    except Exception as e:
        print(f"Error loading {temp_filepath} after download: {e}")
        return None
    finally:
        # Single cleanup point; runs on both the success and failure paths.
        if temp_filepath and os.path.exists(temp_filepath):
            try:
                os.remove(temp_filepath)
                print(f"Cleaned up temporary file: {temp_filepath}")
            except OSError as e:
                print(f"Warning: Could not remove temporary file {temp_filepath}: {e}")
# Fetch every fitted preprocessing artifact plus the trained model at import
# time.  Each name is None when its download/load failed, which lets
# preprocess_input() below skip the corresponding step instead of crashing.
numerical_imputer = download_and_load_pkl(NUM_IMPUTER_URL, "numerical_imputer.pkl")
categorical_imputer = download_and_load_pkl(CAT_IMPUTER_URL, "categorical_imputer.pkl")
le_structure_type = download_and_load_pkl(LE_STRUCTURE_TYPE_URL, "label_encoder_structure_type.pkl")
scaler = download_and_load_pkl(SCALER_URL, "scaler.pkl")
model = download_and_load_pkl(MODEL_URL, "model.pkl")
# These act as placeholders for the rest of the file, so no NameError occurs
# even when a download fails; the try/except inside the loader surfaces any
# download or unpickling problem as a printed message rather than a crash.
# Feature groupings used by the preprocessing pipeline below.
# NOTE(review): 'air_qality_index' and 'issue_reolution_time' look misspelled,
# but they presumably match the column names the imputers/scaler/model were
# fitted on -- do NOT "fix" the spelling here without retraining the artifacts.
NUMERICAL_FEATURES = [
    'site_area', 'water_consumption', 'recycling_rate', 'utilisation_rate', 'air_qality_index', 'issue_reolution_time', 'resident_count'
]
CATEGORICAL_FEATURES = ['structure_type']
# Exact column order the trained model expects at prediction time.
FINAL_MODEL_EXPECTED_FEATURES = [
    'site_area', 'water_consumption', 'recycling_rate', 'utilisation_rate',
    'air_qality_index', 'issue_reolution_time', 'resident_count', 'structure_type'
]
# FINAL_MODEL_EXPECTED_FEATURES lists the columns (and order) of the final
# preprocessed output fed to the trained model.
# The input arrives as a dictionary, but the fitted scikit-learn transformers
# expect a pandas DataFrame, so the function below wraps the dictionary in a
# one-row DataFrame, applies each preprocessing step, and returns the result.
def preprocess_input(input_data: dict) -> pd.DataFrame:
    """Apply the fitted preprocessing pipeline to one raw input record.

    Steps (each skipped when its fitted artifact failed to load):
      1. Standardize the 'structure_type' text (lowercase, stripped).
      2. Impute numerical then categorical features.
      3. Label-encode 'structure_type'.
      4. Scale the numerical features.
      5. Reorder columns to FINAL_MODEL_EXPECTED_FEATURES.

    Parameters
    ----------
    input_data : dict
        Mapping of feature name -> raw value for a single observation.

    Returns
    -------
    pd.DataFrame
        One-row frame with columns in FINAL_MODEL_EXPECTED_FEATURES order.

    Raises
    ------
    ValueError
        When required columns are missing or an unseen category is supplied.
    RuntimeError
        When any fitted transformer fails unexpectedly.
    """

    def _require_columns(df, columns, kind):
        # Fail fast (raise stops the function immediately) with a clear
        # message listing every expected column absent from the input.
        missing = [col for col in columns if col not in df.columns]
        if missing:
            raise ValueError(f"Error : {kind} features {missing} are missing from input DataFrame!")

    df_processed = pd.DataFrame([input_data])
    print(f"DataFrame after initial creation (df_processed)-> \n{df_processed}")

    # Normalize the category text so it matches what the encoder was fitted on.
    if 'structure_type' in df_processed.columns:
        df_processed['structure_type'] = df_processed['structure_type'].astype(str).str.lower().str.strip()
        print(f"'structure_type' standardized to: '{df_processed['structure_type'].iloc[0]}'")

    if numerical_imputer is not None and NUMERICAL_FEATURES:
        _require_columns(df_processed, NUMERICAL_FEATURES, "Numerical")
        try:
            df_processed[NUMERICAL_FEATURES] = numerical_imputer.transform(df_processed[NUMERICAL_FEATURES])
        except Exception as e:
            # 'from e' chains the original traceback so the root cause stays visible
            raise RuntimeError(
                f"Error during numerical imputation\n"
                f"Error : {e}"
            ) from e

    if categorical_imputer is not None and CATEGORICAL_FEATURES:
        _require_columns(df_processed, CATEGORICAL_FEATURES, "Categorical")
        try:
            df_processed[CATEGORICAL_FEATURES] = categorical_imputer.transform(df_processed[CATEGORICAL_FEATURES])
        except Exception as e:
            raise RuntimeError(
                f"Error during categorical imputation\n"
                f"Error : {e}"
            ) from e

    if le_structure_type is not None and 'structure_type' in df_processed.columns:
        try:
            df_processed['structure_type'] = le_structure_type.transform(df_processed['structure_type'])
        except ValueError as e:
            # LabelEncoder raises ValueError for categories unseen at fit time.
            raise ValueError(
                f"Unknown category -> '{df_processed['structure_type'].iloc[0]}' in column 'structure_type'\n"
                f"Error : {e}"
            ) from e
        except Exception as e:
            raise RuntimeError(f"Error during Label Encoding for 'structure_type'...Error: {e}") from e

    if scaler is not None and NUMERICAL_FEATURES:
        _require_columns(df_processed, NUMERICAL_FEATURES, "Numerical")
        try:
            df_processed[NUMERICAL_FEATURES] = scaler.transform(df_processed[NUMERICAL_FEATURES])
        except Exception as e:
            raise RuntimeError(
                f"Error during scaling\n"
                f"Error: {e}"
            ) from e

    print(f"Current df_processed columns before final reorder: {df_processed.columns.tolist()}")
    # Checkpoint: guarantee every column the model was trained on exists.
    # NOTE(review): the 0 fill happens AFTER scaling, so it is 0 in scaled
    # space rather than raw units -- confirm this default is intentional.
    for col in FINAL_MODEL_EXPECTED_FEATURES:
        if col not in df_processed.columns:
            print(f"Adding missing column: '{col}' with value 0.")
            df_processed[col] = 0

    # Reorder to the exact column order the trained model expects.
    df_final = df_processed[FINAL_MODEL_EXPECTED_FEATURES]
    print(f"Final DataFrame for prediction: \n{df_final}")
    return df_final
#The function above performs all of the post-training data preprocessing: numerical and categorical imputation, label encoding, and scaling.
#Error messages and checkpoints are printed throughout because I got stuck in this process several times, and they make the failure point obvious. With preprocessing complete, the next step is DEPLOYMENT -> building the FastAPI service and deploying it (e.g. on AWS).