# NOTE(review): non-code residue from a web file viewer (commit hashes,
# "Runtime error" labels, and a line-number gutter) was captured here when
# this file was scraped; it has been commented out so the file parses.
# Now comes another important task: preprocessing the data.
import pandas as pd
import joblib
import os
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, StandardScaler
import requests
# Direct-download URLs for the fitted preprocessing artifacts and the model,
# hosted in a public GitHub repository. Each file is a joblib pickle.
# NOTE(review): GITHUB_BASE_URL is not referenced anywhere in this file —
# presumably kept for convenience; confirm before removing.
GITHUB_BASE_URL = "https://raw.githubusercontent.com/jainkavya738/Electricity-ML-Artifacts/main/"
NUM_IMPUTER_URL = "https://github.com/jainkavya738/Electricity-ML-Artifacts/raw/refs/heads/main/numerical_imputer.pkl"
CAT_IMPUTER_URL = "https://github.com/jainkavya738/Electricity-ML-Artifacts/raw/refs/heads/main/categorical_imputer.pkl"
LE_STRUCTURE_TYPE_URL = "https://github.com/jainkavya738/Electricity-ML-Artifacts/raw/refs/heads/main/label_encoder_structure_type.pkl"
SCALER_URL = "https://github.com/jainkavya738/Electricity-ML-Artifacts/raw/refs/heads/main/scaler.pkl"
MODEL_URL = "https://github.com/jainkavya738/Electricity-ML-Artifacts/raw/refs/heads/main/model.pkl"
def download_and_load_pkl(url, filename):
    """Download a pickled artifact from *url* into /tmp and load it with joblib.

    Parameters
    ----------
    url : str
        Direct-download URL of the ``.pkl`` artifact.
    filename : str
        Name for the temporary file created under ``/tmp``.

    Returns
    -------
    object or None
        The unpickled object, or ``None`` when the download or the load fails
        (the error is printed rather than raised, so import can continue).
    """
    # Fixed: messages previously printed the literal placeholder "(unknown)"
    # instead of the artifact name, and error paths printed temp_filepath
    # (which is None when the download itself fails) instead of the filename.
    print(f"Attempting to download {filename} from {url}")
    temp_filepath = None
    try:
        response = requests.get(url, stream=True)
        response.raise_for_status()
        temp_filepath = os.path.join('/tmp', filename)
        with open(temp_filepath, 'wb') as f:
            # Stream in 8 KiB chunks so large artifacts are not held in memory.
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)
        # SECURITY: joblib.load unpickles arbitrary code — only safe because
        # the artifacts come from the author's own repository.
        obj = joblib.load(temp_filepath)
        print(f"Successfully downloaded and loaded {filename}")
        return obj
    except requests.exceptions.RequestException as e:
        print(f"Error downloading {filename} from {url}: {e}")
        return None
    except Exception as e:
        print(f"Error loading {filename} after download: {e}")
        return None
    finally:
        # Single cleanup point (the redundant mid-try os.remove was dropped;
        # it also prevented the "Cleaned up" message from ever printing).
        if temp_filepath and os.path.exists(temp_filepath):
            try:
                os.remove(temp_filepath)
                print(f"Cleaned up temporary file: {temp_filepath}")
            except OSError as e:
                print(f"Warning: Could not remove temporary file {temp_filepath}: {e}")
# Load every fitted artifact at import time. Each name is set to None when
# its download or unpickling fails, and the preprocessing steps below skip
# any artifact that is None.
numerical_imputer = download_and_load_pkl(NUM_IMPUTER_URL, "numerical_imputer.pkl")
categorical_imputer = download_and_load_pkl(CAT_IMPUTER_URL, "categorical_imputer.pkl")
le_structure_type = download_and_load_pkl(LE_STRUCTURE_TYPE_URL, "label_encoder_structure_type.pkl")
scaler = download_and_load_pkl(SCALER_URL, "scaler.pkl")
model = download_and_load_pkl(MODEL_URL, "model.pkl")
# These globals act as placeholders for the rest of the file, so no
# name-resolution discrepancies occur even when a download fails.
# download_and_load_pkl wraps every load in try/except, so any failure is
# reported immediately instead of surfacing later as an obscure error.
# Feature lists used by the preprocessing pipeline.
# NOTE(review): 'air_qality_index' and 'issue_reolution_time' look misspelled,
# but presumably match the column names the artifacts were fitted on —
# do NOT correct them without retraining/re-exporting the artifacts.
NUMERICAL_FEATURES = [
    'site_area', 'water_consumption', 'recycling_rate', 'utilisation_rate', 'air_qality_index', 'issue_reolution_time', 'resident_count'
]
CATEGORICAL_FEATURES = ['structure_type']
# Exact column order the trained model expects at prediction time.
FINAL_MODEL_EXPECTED_FEATURES = [
    'site_area', 'water_consumption', 'recycling_rate', 'utilisation_rate',
    'air_qality_index', 'issue_reolution_time', 'resident_count', 'structure_type'
]
# FINAL_MODEL_EXPECTED_FEATURES lists the final columns produced for the trained model.
# The raw input will most likely be a dictionary, but the fitted transformers need a
# pandas DataFrame, so preprocess_input converts the dict to a one-row DataFrame,
# applies the pipeline, and returns the result.
def _require_columns(df: pd.DataFrame, columns: list, kind: str) -> None:
    """Checkpoint: raise ValueError naming any *columns* absent from *df*."""
    missing_input = [col for col in columns if col not in df.columns]
    if missing_input:
        raise ValueError(f"Error : {kind} features {missing_input} are missing from input DataFrame!")


def _apply_transform(transformer, df: pd.DataFrame, columns: list, step: str):
    """Run ``transformer.transform`` on ``df[columns]``; wrap failures in RuntimeError.

    Raising stops preprocessing at the first failing step, so the caller sees
    exactly where the pipeline broke.
    """
    try:
        return transformer.transform(df[columns])
    except Exception as e:
        raise RuntimeError(
            f"Error during {step}\n"
            f"Error : {e}"
        )


def preprocess_input(input_data: dict) -> pd.DataFrame:
    """Apply the fitted preprocessing pipeline to one raw input record.

    Steps (each skipped when its artifact failed to load and is None):
      1. numerical imputation, 2. categorical imputation,
      3. label-encode 'structure_type', 4. scale numerical features,
      5. pad/reorder columns to FINAL_MODEL_EXPECTED_FEATURES.

    Parameters
    ----------
    input_data : dict
        One record mapping feature names to raw values.

    Returns
    -------
    pd.DataFrame
        A single-row frame with exactly the model's expected columns, in order.

    Raises
    ------
    ValueError
        When required input columns are missing, or 'structure_type' holds a
        category unseen during training.
    RuntimeError
        When a fitted transformer fails on the input.
    """
    df_processed = pd.DataFrame([input_data])
    print(f"DataFrame after initial creation (df_processed)-> \n{df_processed}")
    if 'structure_type' in df_processed.columns:
        # Normalize case/whitespace so the label encoder sees canonical categories.
        df_processed['structure_type'] = df_processed['structure_type'].astype(str).str.lower().str.strip()
        print(f"'structure_type' standardized to: '{df_processed['structure_type'].iloc[0]}'")
    if numerical_imputer is not None and NUMERICAL_FEATURES:
        _require_columns(df_processed, NUMERICAL_FEATURES, "Numerical")
        df_processed[NUMERICAL_FEATURES] = _apply_transform(
            numerical_imputer, df_processed, NUMERICAL_FEATURES, "numerical imputation")
    if categorical_imputer is not None and CATEGORICAL_FEATURES:
        _require_columns(df_processed, CATEGORICAL_FEATURES, "Categorical")
        df_processed[CATEGORICAL_FEATURES] = _apply_transform(
            categorical_imputer, df_processed, CATEGORICAL_FEATURES, "categorical imputation")
    if le_structure_type is not None and 'structure_type' in df_processed.columns:
        # Kept inline: unknown categories raise ValueError from the encoder and
        # need a distinct, user-facing message naming the offending value.
        try:
            df_processed['structure_type'] = le_structure_type.transform(df_processed['structure_type'])
        except ValueError as e:
            raise ValueError(
                f"Unknown category -> '{df_processed['structure_type'].iloc[0]}' in column 'structure_type'\n"
                f"Error : {e}"
            )
        except Exception as e:
            raise RuntimeError(f"Error during Label Encoding for 'structure_type'...Error: {e}")
    if scaler is not None and NUMERICAL_FEATURES:
        _require_columns(df_processed, NUMERICAL_FEATURES, "Numerical")
        df_processed[NUMERICAL_FEATURES] = _apply_transform(
            scaler, df_processed, NUMERICAL_FEATURES, "scaling")
    print(f"Current df_processed columns before final reorder: {df_processed.columns.tolist()}")
    # Checkpoint: guarantee every expected column exists before the reorder.
    for col in FINAL_MODEL_EXPECTED_FEATURES:
        if col not in df_processed.columns:
            print(f"Adding missing column: '{col}' with value 0.")
            df_processed[col] = 0
    df_final = df_processed[FINAL_MODEL_EXPECTED_FEATURES]
    print(f"Final DataFrame for prediction: \n{df_final}")
    return df_final
#The function above performs all post-training data preprocessing: numerical and categorical imputation, label encoding, and scaling.
#I included detailed error messages because I got stuck in this process many times, and I added checkpoints in between to highlight mistakes. With the data operations done, the next step is DEPLOYMENT -> creating a FastAPI service and deploying it on AWS, etc.