File size: 7,527 Bytes
6abfac2
 
 
 
 
 
a603065
6abfac2
a603065
6abfac2
a603065
 
ad2e526
a603065
 
 
 
 
ad2e526
 
 
a603065
 
203ea81
 
 
 
 
a603065
 
 
203ea81
a603065
203ea81
a603065
ad2e526
a603065
203ea81
a603065
ad2e526
a603065
203ea81
a603065
ad2e526
 
 
 
 
 
 
 
a603065
 
 
 
 
 
6abfac2
a603065
6abfac2
 
 
 
 
 
 
 
 
a603065
 
6abfac2
 
 
 
 
 
 
 
 
 
 
 
a603065
6abfac2
 
a603065
6abfac2
 
a603065
 
6abfac2
 
 
 
 
 
 
 
 
 
 
 
a603065
 
 
 
 
 
 
 
 
 
 
6abfac2
 
 
 
 
 
 
 
 
 
 
 
 
 
a603065
 
 
 
 
 
 
 
 
 
 
 
6abfac2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a603065
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
# Next important task: preprocessing the data.
import pandas as pd
import joblib
import os
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, StandardScaler
import requests 

GITHUB_BASE_URL = "https://raw.githubusercontent.com/jainkavya738/Electricity-ML-Artifacts/main/" 

NUM_IMPUTER_URL = "https://github.com/jainkavya738/Electricity-ML-Artifacts/raw/refs/heads/main/numerical_imputer.pkl"
CAT_IMPUTER_URL = "https://github.com/jainkavya738/Electricity-ML-Artifacts/raw/refs/heads/main/categorical_imputer.pkl"
LE_STRUCTURE_TYPE_URL = "https://github.com/jainkavya738/Electricity-ML-Artifacts/raw/refs/heads/main/label_encoder_structure_type.pkl"
SCALER_URL = "https://github.com/jainkavya738/Electricity-ML-Artifacts/raw/refs/heads/main/scaler.pkl"
MODEL_URL = "https://github.com/jainkavya738/Electricity-ML-Artifacts/raw/refs/heads/main/model.pkl" 

def download_and_load_pkl(url, filename):
    """Download a pickled artifact from *url* into /tmp and load it with joblib.

    Parameters
    ----------
    url : str
        Direct-download URL of the .pkl artifact.
    filename : str
        Name to use for the temporary file under /tmp.

    Returns
    -------
    object or None
        The unpickled object, or None if the download or the load failed
        (errors are printed, never raised, so import of this module survives).
    """
    print(f"Attempting to download {filename} from {url}")
    
    temp_filepath = None 
    
    try:
        response = requests.get(url, stream=True)
        response.raise_for_status()
        
        temp_filepath = os.path.join('/tmp', filename)
         
        # Stream to disk in chunks so large artifacts never sit fully in memory.
        with open(temp_filepath, 'wb') as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)
        
        obj = joblib.load(temp_filepath)
        print(f"Successfully downloaded and loaded {filename}")
        return obj
    
    except requests.exceptions.RequestException as e:
        print(f"Error downloading {filename} from {url}: {e}")
        return None 
    
    except Exception as e:
        print(f"Error loading {filename} after download: {e}")
        return None 
    
    finally: 
        # Single cleanup point for every path (the success path previously
        # removed the file a second time here, redundantly).
        if temp_filepath and os.path.exists(temp_filepath):
            try:
                os.remove(temp_filepath)
                print(f"Cleaned up temporary file: {temp_filepath}")
            except OSError as e:
                print(f"Warning: Could not remove temporary file {temp_filepath}: {e}")

# Fetch every fitted preprocessing artifact plus the trained model at import
# time. Each global is the loaded object, or None if its download/load failed
# (download_and_load_pkl never raises), so downstream code must None-check.
numerical_imputer = download_and_load_pkl(NUM_IMPUTER_URL, "numerical_imputer.pkl")
categorical_imputer = download_and_load_pkl(CAT_IMPUTER_URL, "categorical_imputer.pkl")
le_structure_type = download_and_load_pkl(LE_STRUCTURE_TYPE_URL, "label_encoder_structure_type.pkl")
scaler = download_and_load_pkl(SCALER_URL, "scaler.pkl")
model = download_and_load_pkl(MODEL_URL, "model.pkl") 

#These loaders double as placeholders for the artifacts in this file, so no discrepancies related to them will occur later.

#Loading goes through try/except so that any download or unpickling error is surfaced immediately.

# Numerical columns the imputer and scaler were fit on.
# NOTE(review): 'air_qality_index' and 'issue_reolution_time' look misspelled,
# but they presumably match the column names used at training time — confirm
# before renaming, since a rename would break the fitted transformers.
NUMERICAL_FEATURES = [
    'site_area', 'water_consumption', 'recycling_rate', 'utilisation_rate', 'air_qality_index', 'issue_reolution_time', 'resident_count'
]
# Categorical columns handled by the categorical imputer and label encoder.
CATEGORICAL_FEATURES = ['structure_type']

# Exact column set and order the trained model expects at predict time.
FINAL_MODEL_EXPECTED_FEATURES = [
    'site_area', 'water_consumption', 'recycling_rate', 'utilisation_rate',
    'air_qality_index', 'issue_reolution_time', 'resident_count', 'structure_type'
] 
#FINAL_MODEL_EXPECTED_FEATURES lists the columns the trained model was fit on, in order.

#The input will most likely arrive as a dictionary, but the pipeline needs a pandas DataFrame, so the function below converts the dict into a one-row DataFrame, applies the preprocessing, and returns it.

def preprocess_input(input_data: dict) -> pd.DataFrame:
    """Apply the saved preprocessing pipeline to a single raw input record.

    Pipeline (each step is skipped if its fitted artifact failed to load):
    numerical imputation -> categorical imputation -> label encoding of
    'structure_type' -> scaling -> reorder to FINAL_MODEL_EXPECTED_FEATURES.

    Parameters
    ----------
    input_data : dict
        Raw feature values for one record.

    Returns
    -------
    pd.DataFrame
        One-row frame with columns ordered as FINAL_MODEL_EXPECTED_FEATURES.

    Raises
    ------
    ValueError
        If required feature columns are missing from the input, or
        'structure_type' holds a category unseen by the label encoder.
    RuntimeError
        If an imputation/encoding/scaling transform fails unexpectedly.
    """
    df_processed = pd.DataFrame([input_data])
    print(f"DataFrame after initial creation (df_processed)-> \n{df_processed}")

    # Normalize the category text so it matches what the encoder was fit on.
    if 'structure_type' in df_processed.columns:
        df_processed['structure_type'] = df_processed['structure_type'].astype(str).str.lower().str.strip()
        print(f"'structure_type' standardized to: '{df_processed['structure_type'].iloc[0]}'")

    if numerical_imputer is not None and NUMERICAL_FEATURES:
        # Verify every expected numerical column is present before transforming.
        missing_input = [col for col in NUMERICAL_FEATURES if col not in df_processed.columns]
        if missing_input:
            raise ValueError(f"Error : Numerical features {missing_input} are missing from input DataFrame!")
        try:
            df_processed[NUMERICAL_FEATURES] = numerical_imputer.transform(df_processed[NUMERICAL_FEATURES])
        except Exception as e:
            # raise stops the function at the first failure instead of
            # propagating half-processed data.
            raise RuntimeError(
                f"Error during numerical imputation\n"
                f"Error : {e}"
            )

    # BUG FIX: the missing-column check and the transform below were previously
    # dedented OUTSIDE this guard, so they ran even when the categorical
    # imputer never loaded (AttributeError on None) and could reuse a stale or
    # undefined `missing_input` from the numerical section (NameError).
    if categorical_imputer is not None and CATEGORICAL_FEATURES:
        missing_input = [col for col in CATEGORICAL_FEATURES if col not in df_processed.columns]
        if missing_input:
            raise ValueError(f"Error : Categorical features {missing_input} are missing from input DataFrame!")
        try:
            df_processed[CATEGORICAL_FEATURES] = categorical_imputer.transform(df_processed[CATEGORICAL_FEATURES])
        except Exception as e:
            raise RuntimeError(
                f"Error during categorical imputation\n"
                f"Error : {e}"
            )

    if le_structure_type is not None and 'structure_type' in df_processed.columns:
        try:
            df_processed['structure_type'] = le_structure_type.transform(df_processed['structure_type'])
        except ValueError as e:
            # LabelEncoder raises ValueError for categories unseen at fit time.
            raise ValueError(
                f"Unknown category -> '{df_processed['structure_type'].iloc[0]}' in column 'structure_type'\n"
                f"Error : {e}"
            )
        except Exception as e:
            raise RuntimeError(f"Error during Label Encoding for 'structure_type'...Error: {e}")

    # BUG FIX: same dedent problem as the categorical block — scaling must only
    # run when the scaler actually loaded.
    if scaler is not None and NUMERICAL_FEATURES:
        missing_input = [col for col in NUMERICAL_FEATURES if col not in df_processed.columns]
        if missing_input:
            raise ValueError(f"Error : Numerical features {missing_input} are missing from input DataFrame")
        try:
            df_processed[NUMERICAL_FEATURES] = scaler.transform(df_processed[NUMERICAL_FEATURES])
        except Exception as e:
            raise RuntimeError(
                f"Error during scaling\n"
                f"Error: {e}"
            )

    print(f"Current df_processed columns before final reorder: {df_processed.columns.tolist()}")
    #Checkpoint

    # Ensure every column the model expects exists; zero-fill any absent one.
    for col in FINAL_MODEL_EXPECTED_FEATURES:
        if col not in df_processed.columns:
            print(f"Adding missing column: '{col}' with value 0.")
            df_processed[col] = 0

    df_final = df_processed[FINAL_MODEL_EXPECTED_FEATURES] 
    print(f"Final DataFrame for prediction: \n{df_final}")

    return df_final

#The function above performs numerical and categorical imputation, label encoding, and scaling — i.e. all the data preprocessing that must mirror what was done at training time.
#I included detailed error messages and checkpoints throughout because I got stuck in this process many times and wanted the mistakes highlighted. With the data operations done, the next step is DEPLOYMENT: creating the FastAPI service and deploying it (e.g. on AWS).