Upload 7 files
- .dockerignore +7 -6
- Preprocessing.py +65 -69
- main.py +3 -11
- requirements.txt +2 -1
- train_and_save_model.py +52 -51
.dockerignore
CHANGED
@@ -1,7 +1,8 @@
-.git
-.venv
-__pycache__
-*.pyc
+.git
+.venv
+__pycache__
+*.pyc
 *.ipynb
-.DS_Store
-*.log
+.DS_Store
+*.log
+*.pkl
Preprocessing.py
CHANGED
@@ -4,49 +4,44 @@ import joblib
 import os
 from sklearn.impute import SimpleImputer
 from sklearn.preprocessing import LabelEncoder, StandardScaler
+import requests
 
-NUM_IMPUTER_PATH = "numerical_imputer.pkl"
-CAT_IMPUTER_PATH = "categorical_imputer.pkl"
-LE_STRUCTURE_TYPE_PATH = "label_encoder_structure_type.pkl"
-SCALER_PATH = "scaler.pkl"
+GITHUB_BASE_URL = "https://raw.githubusercontent.com/jainkavya738/Electricity-ML-Artifacts/main/"
 
-
-
-
-
-
+NUM_IMPUTER_URL = "https://github.com/jainkavya738/Electricity-ML-Artifacts/raw/refs/heads/main/numerical_imputer.pkl"
+CAT_IMPUTER_URL = "https://github.com/jainkavya738/Electricity-ML-Artifacts/raw/refs/heads/main/categorical_imputer.pkl"
+LE_STRUCTURE_TYPE_URL = "https://github.com/jainkavya738/Electricity-ML-Artifacts/raw/refs/heads/main/label_encoder_structure_type.pkl"
+SCALER_URL = "https://github.com/jainkavya738/Electricity-ML-Artifacts/raw/refs/heads/main/scaler.pkl"
+MODEL_URL = "https://github.com/jainkavya738/Electricity-ML-Artifacts/raw/refs/heads/main/model.pkl"
+
+def download_and_load_pkl(url, filename):
+    print(f"Attempting to download {filename} from {url}")
+    try:
+        response = requests.get(url, stream=True)
+        response.raise_for_status()
+
+        with open(filename, 'wb') as f:
+            for chunk in response.iter_content(chunk_size=8192):
+                f.write(chunk)
+
+        obj = joblib.load(filename)
+        print(f"Successfully downloaded and loaded {filename}")
+        os.remove(filename)
+        return obj
+    except requests.exceptions.RequestException as e:
+        print(f"Error downloading {filename} from {url}: {e}")
+        return None
+    except Exception as e:
+        print(f"Error loading {filename} after download: {e}")
+        return None
+
+numerical_imputer = download_and_load_pkl(NUM_IMPUTER_URL, "numerical_imputer.pkl")
+categorical_imputer = download_and_load_pkl(CAT_IMPUTER_URL, "categorical_imputer.pkl")
+le_structure_type = download_and_load_pkl(LE_STRUCTURE_TYPE_URL, "label_encoder_structure_type.pkl")
+scaler = download_and_load_pkl(SCALER_URL, "scaler.pkl")
+model = download_and_load_pkl(MODEL_URL, "model.pkl")
 
-try:
-    numerical_imputer = joblib.load(NUM_IMPUTER_PATH)
-    print(f"Loaded {NUM_IMPUTER_PATH}. Expected features: {getattr(numerical_imputer, 'feature_names_in_', 'N/A')}")
-except FileNotFoundError:
-    print(f"Warning : {NUM_IMPUTER_PATH} not found")
-except Exception as e:
-    print(f"Error loading {NUM_IMPUTER_PATH}: {e}")
-
-try:
-    categorical_imputer = joblib.load(CAT_IMPUTER_PATH)
-    print(f"Loaded {CAT_IMPUTER_PATH}. Expected features: {getattr(categorical_imputer, 'feature_names_in_', 'N/A')}")
-except FileNotFoundError:
-    print(f"Warning: {CAT_IMPUTER_PATH} not found")
-except Exception as e:
-    print(f"Error loading {CAT_IMPUTER_PATH}: {e}")
-
-try:
-    le_structure_type = joblib.load(LE_STRUCTURE_TYPE_PATH)
-    print(f"Loaded {LE_STRUCTURE_TYPE_PATH}")
-except FileNotFoundError:
-    print(f"Warning: {LE_STRUCTURE_TYPE_PATH} not found")
-except Exception as e:
-    print(f"Error loading {LE_STRUCTURE_TYPE_PATH}: {e}")
-
-try:
-    scaler = joblib.load(SCALER_PATH)
-    print(f"Loaded {SCALER_PATH}. Expected features: {getattr(scaler, 'feature_names_in_', 'N/A')}")
-except FileNotFoundError:
-    print(f"Warning: {SCALER_PATH} not found")
-except Exception as e:
-    print(f"Error loading {SCALER_PATH}: {e}")
+#I have done this to set them as a placeholder in this file....therefore no discrepancies related to it will occur
 
 #You can see that I've used the try and except model for loading the data so that if error occurs I'm completely aware of it
 
@@ -56,7 +51,8 @@ NUMERICAL_FEATURES = [
 CATEGORICAL_FEATURES = ['structure_type']
 
 FINAL_MODEL_EXPECTED_FEATURES = [
-    'site_area', '
+    'site_area', 'water_consumption', 'recycling_rate', 'utilisation_rate',
+    'air_qality_index', 'issue_reolution_time', 'resident_count', 'structure_type'
 ]
 #Final model expected features contains the list of the final output of the trained data
 
@@ -69,14 +65,14 @@ def preprocess_input(input_data: dict) -> pd.DataFrame:
     if 'structure_type' in df_processed.columns:
         df_processed['structure_type'] = df_processed['structure_type'].astype(str).str.lower().str.strip()
         print(f"'structure_type' standardized to: '{df_processed['structure_type'].iloc[0]}'")
-
+
     if numerical_imputer is not None and NUMERICAL_FEATURES:
         missing_input = [col for col in NUMERICAL_FEATURES if col not in df_processed.columns]
-
+
         if missing_input:
             raise ValueError(f"Error : Numerical features {missing_input} are missing from input DataFrame!")
-        #This is only to verify...It will give us the missing columns which should
-
+        #This is only to verify...It will give us the missing columns which should be present while doing numerical imputation....basically, I'm trying to handle all the errors possible
+
         try:
             df_processed[NUMERICAL_FEATURES] = numerical_imputer.transform(df_processed[NUMERICAL_FEATURES])
 
@@ -89,17 +85,17 @@ def preprocess_input(input_data: dict) -> pd.DataFrame:
 
     if categorical_imputer is not None and CATEGORICAL_FEATURES:
         missing_input = [col for col in CATEGORICAL_FEATURES if col not in df_processed.columns]
-
-
-
-
-
-
-
-
-
-
-
+
+        if missing_input:
+            raise ValueError(f"Error : Categorical features {missing_input} are missing from input DataFrame!")
+        try:
+            df_processed[CATEGORICAL_FEATURES] = categorical_imputer.transform(df_processed[CATEGORICAL_FEATURES])
+
+        except Exception as e:
+            raise RuntimeError(
+                f"Error during categorical imputation\n"
+                f"Error : {e}"
+            )
 
     if le_structure_type is not None and 'structure_type' in df_processed.columns:
         try:
@@ -114,18 +110,18 @@ def preprocess_input(input_data: dict) -> pd.DataFrame:
 
     if scaler is not None and NUMERICAL_FEATURES:
         missing_input = [col for col in NUMERICAL_FEATURES if col not in df_processed.columns]
-
-
-
-
-
-
-
-
-
-
-
-
+
+        if missing_input:
+            raise ValueError(f"Error : Numerical features {missing_input} are missing from input DataFrame")
+
+        try:
+            df_processed[NUMERICAL_FEATURES] = scaler.transform(df_processed[NUMERICAL_FEATURES])
+
+        except Exception as e:
+            raise RuntimeError(
+                f"Error during scaling\n"
+                f"Error: {e}"
+            )
 
     print(f"Current df_processed columns before final reorder: {df_processed.columns.tolist()}")
     #Checkpoint
@@ -141,4 +137,4 @@ def preprocess_input(input_data: dict) -> pd.DataFrame:
     return df_final
 
 #The function I created above was based upon the numerical and categorical imputation, label encoding, scaling or basically all the data preprocessing that should be done after training all the models.....
-#I have show all the error messages in my coding lines because I got stuck in this process many time and to highlight the mistakes I have created some checkpoints also in between....Therefore, now all the data operations are done and the next thing is DEPLOYMENT-> creation of FastAPI and deployment on AWS etc.
+#I have show all the error messages in my coding lines because I got stuck in this process many time and to highlight the mistakes I have created some checkpoints also in between....Therefore, now all the data operations are done and the next thing is DEPLOYMENT-> creation of FastAPI and deployment on AWS etc.
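With this commit, Preprocessing.py stops reading the transformer .pkl files from the local filesystem and instead downloads all five artifacts (including model.pkl) from the Electricity-ML-Artifacts repo at import time, which is also why *.pkl was added to .dockerignore. A quick way to exercise that import-time path locally is a smoke test along these lines (a sketch only: the sample values are invented, and it assumes the five downloads succeed):

# smoke_test.py -- hypothetical local check, not part of this commit
from Preprocessing import preprocess_input, model

# One made-up input row; the keys mirror FINAL_MODEL_EXPECTED_FEATURES.
sample = {
    "site_area": 1200.0,
    "water_consumption": 3500.0,
    "recycling_rate": 45.0,
    "utilisation_rate": 70.0,
    "air_qality_index": 110.0,
    "issue_reolution_time": 24.0,
    "resident_count": 150.0,
    "structure_type": "Residential",
}

df = preprocess_input(sample)   # impute -> label-encode -> scale -> reorder
if model is not None:
    print(model.predict(df))    # single-row cost prediction
else:
    print("model failed to download; check the warnings printed at import")

One trade-off worth noting: download_and_load_pkl removes each file immediately after loading it, so every process restart re-downloads all five artifacts; checking os.path.exists(filename) before fetching would make restarts cheaper, at the cost of possibly serving stale artifacts.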
main.py
CHANGED
@@ -4,23 +4,15 @@ import joblib
 import pandas as pd
 import os
 
-from Preprocessing import preprocess_input
+from Preprocessing import preprocess_input, model
 
 app = FastAPI(
     title="Electricity Cost Prediction API",
     description="Predicts electricity cost based on facility and operational parameters"
 )
 
-MODEL_PATH = "model.pkl"
-
-if not os.path.exists(MODEL_PATH):
-    raise FileNotFoundError(
-        "Model file not found"
-    )
-try:
-    model = joblib.load(MODEL_PATH)
-except Exception as e:
-    raise RuntimeError(f"Error loading model from {MODEL_PATH}: {e}")
+if model is None:
+    raise RuntimeError("Critical Error: ML model failed to load from external source during application startup.")
 
 class ElectricityInput(BaseModel):
     site_area: float = Field(..., description="Area of the site in square units")
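Because main.py now imports model from Preprocessing rather than loading model.pkl itself, the `model is None` guard makes the Space fail fast at startup instead of returning 500s on the first request. Once it is up, a client call might look like the sketch below; the /predict route and the port are assumptions (the endpoint definition falls outside this diff), and the payload fields follow ElectricityInput / FINAL_MODEL_EXPECTED_FEATURES:

# hypothetical client call -- route name and port are assumed, values invented
import requests

payload = {
    "site_area": 1200.0,
    "water_consumption": 3500.0,
    "recycling_rate": 45.0,
    "utilisation_rate": 70.0,
    "air_qality_index": 110.0,
    "issue_reolution_time": 24.0,
    "resident_count": 150.0,
    "structure_type": "residential",
}

r = requests.post("http://localhost:8000/predict", json=payload, timeout=30)
r.raise_for_status()
print(r.json())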
requirements.txt
CHANGED
@@ -5,4 +5,5 @@ scikit-learn
 joblib
 pydantic
 gunicorn
-openpyxl
+openpyxl
+requests
train_and_save_model.py
CHANGED
@@ -8,62 +8,63 @@ from sklearn.preprocessing import LabelEncoder, StandardScaler
 from sklearn.model_selection import train_test_split
 from sklearn.linear_model import LinearRegression
 import joblib
-import os
+import os
 import re
 
-
-
+# Changed to a relative path for better portability on deployment platforms
+DATASET_PATH = "electricity_cost_dataset.csv.xlsx"
+MODEL_OUTPUT_DIR = "."
 
 os.makedirs(MODEL_OUTPUT_DIR, exist_ok=True)
 
 def RenamingColumns(Column_Name):
-
-
-
+    Column_Name = re.sub(r'\s+', '_', Column_Name)
+    Column_Name = re.sub(r'[^\w_]', '', Column_Name)
+    return Column_Name.lower()
 
 try:
-
-
-
+    df = pd.read_excel(DATASET_PATH)
+    print("Original columns ->\n")
+    print(df.columns.tolist())
 
-
-
-
-
-
-
-
-
+    new_columns = []
+
+    #As I've to rename the columns....I'm using a for loop to do this->
+    #If, the column names given as an input in the FastAPI are not same as the column names in the dataset...an error will be occured on the fastAPI application
+
+    for col in df.columns:
+        new_col = RenamingColumns(col)
+        new_columns.append(new_col)
 
-
+    df.columns = new_columns
 
-
-
+    print("Renamed Columns ->\n")
+    print(df.columns.tolist())
 
 except FileNotFoundError:
-
-
-
+    print(f"Error: Dataset not found! Please ensure the file is in the same directory")
+    exit()
+
 except Exception as e:
-
-
-
+    print(f"Error : {e}")
+    exit()
+
 #I used try and except blocks for ERROR HANDLING
 #Now, all the names have been changed and I've converted same as the datset ones...Therefor from here, I've used new names
 
 TARGET_COL = 'electricity_cost'
 
 if TARGET_COL not in df.columns:
-
-
+    print(f"Error: Target column '{TARGET_COL}' not found!")
+    exit()
 
 features_df = df.drop(columns=[TARGET_COL])
 #Using .drop, I removed the feature which will not be used in calculation
 y = df[TARGET_COL]
 
 NUMERICAL_FEATURES = [
-
-
+    'site_area', 'water_consumption', 'recycling_rate', 'utilisation_rate',
+    'air_qality_index', 'issue_reolution_time', 'resident_count'
 ]
 CATEGORICAL_FEATURES = ['structure_type']
 
@@ -71,44 +72,44 @@ all_expected_features = NUMERICAL_FEATURES + CATEGORICAL_FEATURES
 missing_features = [col for col in all_expected_features if col not in features_df.columns]
 
 if missing_features:
-
-
+    print(f"Error: The following expected features are missing from the data after renaming: {missing_features}")
+    exit()
 #The above steps were only for the safety purpose...to recheck if there is any missing features.
 #Actually, I did it just because I was facing many errors...therefore just to check I added some checkpoints.
 
 numerical_imputer = SimpleImputer(strategy='mean')
 if NUMERICAL_FEATURES:
-
-
-
+    features_df[NUMERICAL_FEATURES] = numerical_imputer.fit_transform(features_df[NUMERICAL_FEATURES])
+    joblib.dump(numerical_imputer, os.path.join(MODEL_OUTPUT_DIR, 'numerical_imputer.pkl'))
+    print("Numerical imputer fitted and saved")
 else:
-
+    print("No numerical columns to impute")
 
 categorical_imputer = SimpleImputer(strategy='most_frequent')
 if CATEGORICAL_FEATURES:
-
-
-
+    features_df[CATEGORICAL_FEATURES] = categorical_imputer.fit_transform(features_df[CATEGORICAL_FEATURES])
+    joblib.dump(categorical_imputer, os.path.join(MODEL_OUTPUT_DIR, 'categorical_imputer.pkl'))
+    print("Categorical imputer fitted and saved")
 else:
-
+    print("No categorical columns to impute")
 #I used joblib because I wanted to use this data later as well...therefore, whenever I will be in need of it I will load this with joblib.load()
 
 if 'structure_type' in features_df.columns:
-
-
-
-
-
+    features_df['structure_type'] = features_df['structure_type'].astype(str).str.lower().str.strip()
+    le_structure_type = LabelEncoder()
+    features_df['structure_type'] = le_structure_type.fit_transform(features_df['structure_type'])
+    joblib.dump(le_structure_type, os.path.join(MODEL_OUTPUT_DIR, 'label_encoder_structure_type.pkl'))
+    print("LabelEncoder for 'structure_type' fitted and saved.")
 else:
-
+    print("structure_type column not found or not categorical, skipping LabelEncoder.")
 
 if NUMERICAL_FEATURES:
-
-
-
-
+    scaler = StandardScaler()
+    features_df[NUMERICAL_FEATURES] = scaler.fit_transform(features_df[NUMERICAL_FEATURES])
+    joblib.dump(scaler, os.path.join(MODEL_OUTPUT_DIR, 'scaler.pkl'))
+    print("StandardScaler fitted and saved.")
 else:
-
+    print("No numerical columns to scale.")
 
 #You can see that, I've used joblib.dump to create a separate directory for each imputer and encoder made
 
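The restored RenamingColumns body is what keeps the Excel headers, the saved transformers, and the FastAPI field names in sync: runs of whitespace become underscores, remaining punctuation is stripped, and everything is lowercased. A tiny illustration (the raw header strings here are guesses; the dataset's exact headers are not shown in this diff):

# illustrative only -- the input headers are assumed, the function is from the commit
import re

def RenamingColumns(Column_Name):
    Column_Name = re.sub(r'\s+', '_', Column_Name)    # whitespace runs -> underscore
    Column_Name = re.sub(r'[^\w_]', '', Column_Name)  # drop non-word characters
    return Column_Name.lower()

print(RenamingColumns("Site Area"))          # site_area
print(RenamingColumns("water consumption"))  # water_consumption
print(RenamingColumns("structure type"))     # structure_type

This also explains why the feature lists keep spellings like 'air_qality_index' and 'issue_reolution_time': the function only normalizes whatever headers the dataset ships with, typos included, so the pickled imputers and scaler expect exactly those names.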