Spaces:
Runtime error
Runtime error
# Preprocessing module: downloads the fitted preprocessing artifacts
# (imputers, label encoder, scaler) and the trained model from GitHub,
# and exposes preprocess_input() for inference-time data preparation.
import pandas as pd
import joblib
import os
# NOTE(review): SimpleImputer/LabelEncoder/StandardScaler are not referenced
# directly below -- presumably imported so sklearn classes are available when
# the pickled artifacts are unpickled; confirm before removing.
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, StandardScaler
import requests

# Raw-content base URL for the artifact repository. Currently unused -- the
# full per-artifact URLs below are used instead.
GITHUB_BASE_URL = "https://raw.githubusercontent.com/jainkavya738/Electricity-ML-Artifacts/main/"
# Direct download URLs for each pickled artifact produced at training time.
NUM_IMPUTER_URL = "https://github.com/jainkavya738/Electricity-ML-Artifacts/raw/refs/heads/main/numerical_imputer.pkl"
CAT_IMPUTER_URL = "https://github.com/jainkavya738/Electricity-ML-Artifacts/raw/refs/heads/main/categorical_imputer.pkl"
LE_STRUCTURE_TYPE_URL = "https://github.com/jainkavya738/Electricity-ML-Artifacts/raw/refs/heads/main/label_encoder_structure_type.pkl"
SCALER_URL = "https://github.com/jainkavya738/Electricity-ML-Artifacts/raw/refs/heads/main/scaler.pkl"
MODEL_URL = "https://github.com/jainkavya738/Electricity-ML-Artifacts/raw/refs/heads/main/model.pkl"
def download_and_load_pkl(url, filename):
    """Download a pickled artifact from ``url`` and load it with joblib.

    The response is streamed into ``/tmp/<filename>``, loaded, and the
    temporary copy is always removed by the ``finally`` block.

    Returns the loaded object, or ``None`` when the download or the load
    fails -- callers must check for ``None`` before using the artifact.
    """
    # Fix: the original printed the literal placeholder "(unknown)" here
    # instead of the filename being downloaded.
    print(f"Attempting to download {filename} from {url}")
    temp_filepath = None
    try:
        # stream=True avoids holding large artifacts in memory at once; the
        # timeout keeps a dead server from hanging module import forever.
        response = requests.get(url, stream=True, timeout=60)
        response.raise_for_status()
        temp_filepath = os.path.join('/tmp', filename)
        with open(temp_filepath, 'wb') as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)
        obj = joblib.load(temp_filepath)
        print(f"Successfully downloaded and loaded {filename}")
        # Cleanup happens in `finally`; the original's extra os.remove()
        # here was redundant.
        return obj
    except requests.exceptions.RequestException as e:
        # Fix: report the filename -- temp_filepath is still None when the
        # GET itself fails, so the original message printed "None".
        print(f"Error downloading {filename} from {url}: {e}")
        return None
    except Exception as e:
        print(f"Error loading {temp_filepath} after download: {e}")
        return None
    finally:
        # Single cleanup point for the temporary file, on every code path.
        if temp_filepath and os.path.exists(temp_filepath):
            try:
                os.remove(temp_filepath)
                print(f"Cleaned up temporary file: {temp_filepath}")
            except OSError as e:
                print(f"Warning: Could not remove temporary file {temp_filepath}: {e}")
# Download all fitted artifacts at import time. Each global is None when its
# download/load failed (download_and_load_pkl reports the error and returns
# None), so downstream code must guard every use with an `is not None` check.
numerical_imputer = download_and_load_pkl(NUM_IMPUTER_URL, "numerical_imputer.pkl")
categorical_imputer = download_and_load_pkl(CAT_IMPUTER_URL, "categorical_imputer.pkl")
le_structure_type = download_and_load_pkl(LE_STRUCTURE_TYPE_URL, "label_encoder_structure_type.pkl")
scaler = download_and_load_pkl(SCALER_URL, "scaler.pkl")
model = download_and_load_pkl(MODEL_URL, "model.pkl")
# These globals act as placeholders for the fitted pipeline, so no loading
# discrepancies occur later in this file; the try/except inside
# download_and_load_pkl surfaces any failure instead of crashing the import.

# Column lists for the preprocessing steps below.
# NOTE(review): 'air_qality_index' and 'issue_reolution_time' look misspelled,
# but they presumably match the exact column names the artifacts were fitted
# on -- do not rename without re-fitting; confirm against the training code.
NUMERICAL_FEATURES = [
    'site_area', 'water_consumption', 'recycling_rate', 'utilisation_rate', 'air_qality_index', 'issue_reolution_time', 'resident_count'
]
CATEGORICAL_FEATURES = ['structure_type']
# Exact column set AND order the trained model expects at prediction time.
FINAL_MODEL_EXPECTED_FEATURES = [
    'site_area', 'water_consumption', 'recycling_rate', 'utilisation_rate',
    'air_qality_index', 'issue_reolution_time', 'resident_count', 'structure_type'
]
def _require_columns(df: pd.DataFrame, columns: list, kind: str) -> None:
    """Raise ValueError naming any of ``columns`` absent from ``df``."""
    missing = [col for col in columns if col not in df.columns]
    if missing:
        raise ValueError(f"Error : {kind} features {missing} are missing from input DataFrame!")


def preprocess_input(input_data: dict) -> pd.DataFrame:
    """Apply the fitted preprocessing pipeline to one raw input record.

    Steps, in the order the artifacts were fitted: build a one-row DataFrame,
    normalise the 'structure_type' text, impute numerical features, impute
    categorical features, label-encode 'structure_type', scale the numerical
    features, then reorder columns to FINAL_MODEL_EXPECTED_FEATURES.

    Each transformation is skipped when its module-level artifact is None
    (i.e. it failed to download at import time).

    Raises:
        ValueError: required columns missing, or an unseen category.
        RuntimeError: a fitted transformer rejected the data.
    """
    df_processed = pd.DataFrame([input_data])
    print(f"DataFrame after initial creation (df_processed)-> \n{df_processed}")

    # Normalise case/whitespace so user input matches the categories the
    # label encoder was fitted on (presumably lower-cased at training time
    # -- confirm against the training code).
    if 'structure_type' in df_processed.columns:
        df_processed['structure_type'] = df_processed['structure_type'].astype(str).str.lower().str.strip()
        print(f"'structure_type' standardized to: '{df_processed['structure_type'].iloc[0]}'")

    if numerical_imputer is not None and NUMERICAL_FEATURES:
        # Validation extracted to _require_columns: the original repeated
        # this check (with inconsistent punctuation) three times.
        _require_columns(df_processed, NUMERICAL_FEATURES, "Numerical")
        try:
            df_processed[NUMERICAL_FEATURES] = numerical_imputer.transform(df_processed[NUMERICAL_FEATURES])
        except Exception as e:
            # raise stops the pipeline immediately on the first failure.
            raise RuntimeError(
                f"Error during numerical imputation\n"
                f"Error : {e}"
            )

    if categorical_imputer is not None and CATEGORICAL_FEATURES:
        _require_columns(df_processed, CATEGORICAL_FEATURES, "Categorical")
        try:
            df_processed[CATEGORICAL_FEATURES] = categorical_imputer.transform(df_processed[CATEGORICAL_FEATURES])
        except Exception as e:
            raise RuntimeError(
                f"Error during categorical imputation\n"
                f"Error : {e}"
            )

    if le_structure_type is not None and 'structure_type' in df_processed.columns:
        try:
            df_processed['structure_type'] = le_structure_type.transform(df_processed['structure_type'])
        except ValueError as e:
            # LabelEncoder raises ValueError for categories unseen at fit time.
            raise ValueError(
                f"Unknown category -> '{df_processed['structure_type'].iloc[0]}' in column 'structure_type'\n"
                f"Error : {e}"
            )
        except Exception as e:
            raise RuntimeError(f"Error during Label Encoding for 'structure_type'...Error: {e}")

    if scaler is not None and NUMERICAL_FEATURES:
        _require_columns(df_processed, NUMERICAL_FEATURES, "Numerical")
        try:
            df_processed[NUMERICAL_FEATURES] = scaler.transform(df_processed[NUMERICAL_FEATURES])
        except Exception as e:
            raise RuntimeError(
                f"Error during scaling\n"
                f"Error: {e}"
            )

    # Checkpoint before the final reorder.
    print(f"Current df_processed columns before final reorder: {df_processed.columns.tolist()}")

    # Guarantee every model column exists; filling with 0 can skew a
    # prediction, so it is logged loudly rather than done silently.
    for col in FINAL_MODEL_EXPECTED_FEATURES:
        if col not in df_processed.columns:
            print(f"Adding missing column: '{col}' with value 0.")
            df_processed[col] = 0

    df_final = df_processed[FINAL_MODEL_EXPECTED_FEATURES]
    print(f"Final DataFrame for prediction: \n{df_final}")
    return df_final
#The function created above performs the numerical and categorical imputation, label encoding, and scaling -- essentially all the data preprocessing that must be applied to inference inputs once the models are trained.
#I have included error messages throughout because I got stuck in this process many times, and I have also added checkpoints in between to highlight mistakes. All the data operations are now done; the next step is DEPLOYMENT -> creation of a FastAPI service and deployment on AWS etc.