"""Data preprocessing for the electricity ML pipeline.

Downloads the fitted preprocessing artifacts (numerical/categorical
imputers, a label encoder for ``structure_type``, a scaler) and the
trained model from GitHub, then exposes :func:`preprocess_input`, which
turns a raw input dict into a model-ready, single-row DataFrame.
"""

import os

import joblib
import pandas as pd
import requests
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, StandardScaler

# NOTE(review): GITHUB_BASE_URL is never used below (the per-artifact URLs
# are spelled out in full) — kept so any external references still resolve.
GITHUB_BASE_URL = "https://raw.githubusercontent.com/jainkavya738/Electricity-ML-Artifacts/main/"
NUM_IMPUTER_URL = "https://github.com/jainkavya738/Electricity-ML-Artifacts/raw/refs/heads/main/numerical_imputer.pkl"
CAT_IMPUTER_URL = "https://github.com/jainkavya738/Electricity-ML-Artifacts/raw/refs/heads/main/categorical_imputer.pkl"
LE_STRUCTURE_TYPE_URL = "https://github.com/jainkavya738/Electricity-ML-Artifacts/raw/refs/heads/main/label_encoder_structure_type.pkl"
SCALER_URL = "https://github.com/jainkavya738/Electricity-ML-Artifacts/raw/refs/heads/main/scaler.pkl"
MODEL_URL = "https://github.com/jainkavya738/Electricity-ML-Artifacts/raw/refs/heads/main/model.pkl"


def download_and_load_pkl(url, filename):
    """Download a pickled artifact from *url* and load it with joblib.

    The payload is streamed to a temporary file under ``/tmp`` (named
    *filename*), loaded, and the temporary file is always removed in the
    ``finally`` block — on success and on failure alike.

    Returns the unpickled object, or ``None`` if the download or the
    deserialization fails (callers treat ``None`` as "artifact missing").
    """
    print(f"Attempting to download {filename} from {url}")
    temp_filepath = None
    try:
        # Timeout so a stalled connection cannot hang module import forever.
        response = requests.get(url, stream=True, timeout=30)
        response.raise_for_status()
        temp_filepath = os.path.join('/tmp', filename)
        with open(temp_filepath, 'wb') as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)
        obj = joblib.load(temp_filepath)
        print(f"Successfully downloaded and loaded {filename}")
        return obj
    except requests.exceptions.RequestException as e:
        # Use *filename* here: temp_filepath may still be None if the
        # request itself failed, which previously printed "None".
        print(f"Error downloading {filename} from {url}: {e}")
        return None
    except Exception as e:
        print(f"Error loading {filename} after download: {e}")
        return None
    finally:
        # Single point of cleanup (the old extra os.remove inside the try
        # block was redundant with this).
        if temp_filepath and os.path.exists(temp_filepath):
            try:
                os.remove(temp_filepath)
                print(f"Cleaned up temporary file: {temp_filepath}")
            except OSError as e:
                print(f"Warning: Could not remove temporary file {temp_filepath}: {e}")


# Load all artifacts at import time; any of these may be None if the
# download failed, and preprocess_input() skips the corresponding step.
numerical_imputer = download_and_load_pkl(NUM_IMPUTER_URL, "numerical_imputer.pkl")
categorical_imputer = download_and_load_pkl(CAT_IMPUTER_URL, "categorical_imputer.pkl")
le_structure_type = download_and_load_pkl(LE_STRUCTURE_TYPE_URL, "label_encoder_structure_type.pkl")
scaler = download_and_load_pkl(SCALER_URL, "scaler.pkl")
model = download_and_load_pkl(MODEL_URL, "model.pkl")

# NOTE(review): 'air_qality_index' and 'issue_reolution_time' look like
# typos, but they are presumably the exact column names the artifacts were
# fitted on — do NOT rename without re-fitting imputers/scaler/model.
NUMERICAL_FEATURES = [
    'site_area', 'water_consumption', 'recycling_rate', 'utilisation_rate',
    'air_qality_index', 'issue_reolution_time', 'resident_count'
]
CATEGORICAL_FEATURES = ['structure_type']

# Exact column order the trained model expects at predict time.
FINAL_MODEL_EXPECTED_FEATURES = [
    'site_area', 'water_consumption', 'recycling_rate', 'utilisation_rate',
    'air_qality_index', 'issue_reolution_time', 'resident_count',
    'structure_type'
]


def preprocess_input(input_data: dict) -> pd.DataFrame:
    """Transform a raw input dict into a model-ready single-row DataFrame.

    Steps (each skipped if its artifact failed to load): lower-case/strip
    'structure_type', impute numerical and categorical features, label-encode
    'structure_type', scale numerical features, then reorder columns to
    FINAL_MODEL_EXPECTED_FEATURES (zero-filling any that are missing).

    Raises:
        ValueError: if required input columns are missing, or the
            'structure_type' value was never seen by the label encoder.
        RuntimeError: if any fitted transformer fails to apply.
    """
    df_processed = pd.DataFrame([input_data])
    print(f"DataFrame after initial creation (df_processed)-> \n{df_processed}")

    # Normalize the category text so it matches what the encoder was fit on.
    if 'structure_type' in df_processed.columns:
        df_processed['structure_type'] = (
            df_processed['structure_type'].astype(str).str.lower().str.strip()
        )
        print(f"'structure_type' standardized to: '{df_processed['structure_type'].iloc[0]}'")

    if numerical_imputer is not None and NUMERICAL_FEATURES:
        missing_input = [col for col in NUMERICAL_FEATURES if col not in df_processed.columns]
        if missing_input:
            raise ValueError(f"Error : Numerical features {missing_input} are missing from input DataFrame!")
        try:
            df_processed[NUMERICAL_FEATURES] = numerical_imputer.transform(df_processed[NUMERICAL_FEATURES])
        except Exception as e:
            # raise stops the pipeline immediately at the failing step
            raise RuntimeError(
                f"Error during numerical imputation\n"
                f"Error : {e}"
            )

    if categorical_imputer is not None and CATEGORICAL_FEATURES:
        missing_input = [col for col in CATEGORICAL_FEATURES if col not in df_processed.columns]
        if missing_input:
            raise ValueError(f"Error : Categorical features {missing_input} are missing from input DataFrame!")
        try:
            df_processed[CATEGORICAL_FEATURES] = categorical_imputer.transform(df_processed[CATEGORICAL_FEATURES])
        except Exception as e:
            raise RuntimeError(
                f"Error during categorical imputation\n"
                f"Error : {e}"
            )

    if le_structure_type is not None and 'structure_type' in df_processed.columns:
        try:
            df_processed['structure_type'] = le_structure_type.transform(df_processed['structure_type'])
        except ValueError as e:
            # LabelEncoder raises ValueError for categories unseen at fit time.
            raise ValueError(
                f"Unknown category -> '{df_processed['structure_type'].iloc[0]}' in column 'structure_type'\n"
                f"Error : {e}"
            )
        except Exception as e:
            raise RuntimeError(f"Error during Label Encoding for 'structure_type'...Error: {e}")

    if scaler is not None and NUMERICAL_FEATURES:
        missing_input = [col for col in NUMERICAL_FEATURES if col not in df_processed.columns]
        if missing_input:
            raise ValueError(f"Error : Numerical features {missing_input} are missing from input DataFrame")
        try:
            df_processed[NUMERICAL_FEATURES] = scaler.transform(df_processed[NUMERICAL_FEATURES])
        except Exception as e:
            raise RuntimeError(
                f"Error during scaling\n"
                f"Error: {e}"
            )

    print(f"Current df_processed columns before final reorder: {df_processed.columns.tolist()}")  # Checkpoint

    # Zero-fill any expected column the caller omitted, then fix the order.
    for col in FINAL_MODEL_EXPECTED_FEATURES:
        if col not in df_processed.columns:
            print(f"Adding missing column: '{col}' with value 0.")
            df_processed[col] = 0

    df_final = df_processed[FINAL_MODEL_EXPECTED_FEATURES]
    print(f"Final DataFrame for prediction: \n{df_final}")
    return df_final

# The function above bundles all post-training preprocessing (numerical and
# categorical imputation, label encoding, scaling). Error messages and
# checkpoints are intentionally verbose to surface failures early.
# Next step: deployment (FastAPI service, AWS, etc.).