# Preprocessing & training script for the electricity-cost dataset.
# (Stray paste artifacts — "Spaces:" / "Runtime error" lines that were not
# valid Python — have been converted into this comment header.)
# Next Task -> Training the dataset
# In this file I've done Training and encoding the dataset
# Now as I've already done the EDA...the next task is to train and save the data for preprocessing
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import joblib
import os
import re

# Changed to a relative path for better portability on deployment platforms.
# NOTE(review): the filename carries a double extension (.csv.xlsx) but is
# read with pd.read_excel below, i.e. it is treated as an Excel workbook —
# confirm the actual file name on disk matches this.
DATASET_PATH = "electricity_cost_dataset.csv.xlsx"
MODEL_OUTPUT_DIR = "."  # all fitted artefacts (.pkl files) land in the CWD
os.makedirs(MODEL_OUTPUT_DIR, exist_ok=True)
def RenamingColumns(Column_Name):
    """Normalise a raw dataset heading into a lowercase identifier.

    Runs of whitespace collapse to a single underscore, any remaining
    non-word characters are stripped, and the result is lowercased, e.g.
    "Air Quality Index" -> "air_quality_index".
    """
    return re.sub(r'[^\w_]', '', re.sub(r'\s+', '_', Column_Name)).lower()
# Load the raw dataset and normalise every column heading so the names match
# what the downstream FastAPI request schema expects. The try/except keeps
# a readable message for the common failure modes instead of a traceback.
try:
    df = pd.read_excel(DATASET_PATH)
    print("Original columns ->\n")
    print(df.columns.tolist())
    # Rename every column; mismatched names would make the FastAPI app fail
    # when it rebuilds the feature frame from request fields.
    df.columns = [RenamingColumns(col) for col in df.columns]
    print("Renamed Columns ->\n")
    print(df.columns.tolist())
except FileNotFoundError:
    print("Error: Dataset not found! Please ensure the file is in the same directory")
    # raise SystemExit(1) instead of exit(): exits non-zero on failure and
    # does not rely on the site-module `exit` helper.
    raise SystemExit(1)
except Exception as e:
    print(f"Error : {e}")
    raise SystemExit(1)
# From here on all column names are the normalised (renamed) ones.
# Split the frame into features and target, then sanity-check that every
# expected feature survived the renaming step before fitting anything.
TARGET_COL = 'electricity_cost'
if TARGET_COL not in df.columns:
    print(f"Error: Target column '{TARGET_COL}' not found!")
    raise SystemExit(1)  # non-zero exit so the failure is visible to callers

# Features = everything except the target column; the target becomes y.
features_df = df.drop(columns=[TARGET_COL])
y = df[TARGET_COL]

# NOTE(review): 'air_qality_index' and 'issue_reolution_time' look misspelled,
# but they are kept verbatim because they must match the renamed dataset
# headings (and the FastAPI schema) exactly — confirm against the spreadsheet
# before "fixing" the spelling.
NUMERICAL_FEATURES = [
    'site_area', 'water_consumption', 'recycling_rate', 'utilisation_rate',
    'air_qality_index', 'issue_reolution_time', 'resident_count'
]
CATEGORICAL_FEATURES = ['structure_type']

# Safety checkpoint: fail fast (rather than mid-pipeline) if any expected
# feature is missing after renaming.
all_expected_features = NUMERICAL_FEATURES + CATEGORICAL_FEATURES
missing_features = [col for col in all_expected_features if col not in features_df.columns]
if missing_features:
    print(f"Error: The following expected features are missing from the data after renaming: {missing_features}")
    raise SystemExit(1)
# Fit each preprocessing artefact and persist it with joblib so the serving
# application can re-load the exact same transforms via joblib.load().

# Mean-impute missing numeric values.
num_imputer = SimpleImputer(strategy='mean')
if NUMERICAL_FEATURES:
    features_df[NUMERICAL_FEATURES] = num_imputer.fit_transform(features_df[NUMERICAL_FEATURES])
    joblib.dump(num_imputer, os.path.join(MODEL_OUTPUT_DIR, 'numerical_imputer.pkl'))
    print("Numerical imputer fitted and saved")
else:
    print("No numerical columns to impute")

# Mode-impute missing categorical values.
cat_imputer = SimpleImputer(strategy='most_frequent')
if CATEGORICAL_FEATURES:
    features_df[CATEGORICAL_FEATURES] = cat_imputer.fit_transform(features_df[CATEGORICAL_FEATURES])
    joblib.dump(cat_imputer, os.path.join(MODEL_OUTPUT_DIR, 'categorical_imputer.pkl'))
    print("Categorical imputer fitted and saved")
else:
    print("No categorical columns to impute")

# Integer-encode the structure type. The raw strings are lowercased and
# stripped first so the encoder's classes_ are normalised.
# NOTE(review): sklearn documents LabelEncoder for *targets*, not features
# (OrdinalEncoder is the feature-side equivalent) — it works here on a single
# column, but the serving code must apply the identical normalisation.
if 'structure_type' in features_df.columns:
    features_df['structure_type'] = features_df['structure_type'].astype(str).str.lower().str.strip()
    structure_encoder = LabelEncoder()
    features_df['structure_type'] = structure_encoder.fit_transform(features_df['structure_type'])
    joblib.dump(structure_encoder, os.path.join(MODEL_OUTPUT_DIR, 'label_encoder_structure_type.pkl'))
    print("LabelEncoder for 'structure_type' fitted and saved.")
else:
    print("structure_type column not found or not categorical, skipping LabelEncoder.")

# Standardise the numeric features (zero mean, unit variance).
if NUMERICAL_FEATURES:
    feature_scaler = StandardScaler()
    features_df[NUMERICAL_FEATURES] = feature_scaler.fit_transform(features_df[NUMERICAL_FEATURES])
    joblib.dump(feature_scaler, os.path.join(MODEL_OUTPUT_DIR, 'scaler.pkl'))
    print("StandardScaler fitted and saved.")
else:
    print("No numerical columns to scale.")
# Final design matrix and target, then a standard 80/20 hold-out split.
# (The redundant re-assignment of y — it was already set right after the
# target column was dropped — has been removed.)
X = features_df
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LinearRegression()
model.fit(X_train, y_train)
joblib.dump(model, os.path.join(MODEL_OUTPUT_DIR, 'model.pkl'))

# Previously X_test/y_test were computed but never used; report hold-out R^2
# so a training run gives immediate feedback on model quality.
print(f"Hold-out R^2 -> {model.score(X_test, y_test):.4f}")

# The serving app needs the exact feature order the model was trained on.
FINAL_MODEL_EXPECTED_FEATURES = X_train.columns.tolist()
print("All expected features from Final Model->\n")
print(FINAL_MODEL_EXPECTED_FEATURES)
# All necessary .pkl files are now saved in the current directory.