"""Data preprocessing for the electricity ML pipeline.

Downloads the fitted preprocessing artifacts (numerical/categorical
imputers, a label encoder for ``structure_type``, a scaler) and the
trained model from GitHub, then exposes :func:`preprocess_input`, which
turns a raw input dict into a model-ready, single-row DataFrame.
"""

import os

import joblib
import pandas as pd
import requests
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, StandardScaler

# NOTE(review): GITHUB_BASE_URL is never used below (the per-artifact URLs
# are spelled out in full) — kept so any external references still resolve.
GITHUB_BASE_URL = "https://raw.githubusercontent.com/jainkavya738/Electricity-ML-Artifacts/main/"
NUM_IMPUTER_URL = "https://github.com/jainkavya738/Electricity-ML-Artifacts/raw/refs/heads/main/numerical_imputer.pkl"
CAT_IMPUTER_URL = "https://github.com/jainkavya738/Electricity-ML-Artifacts/raw/refs/heads/main/categorical_imputer.pkl"
LE_STRUCTURE_TYPE_URL = "https://github.com/jainkavya738/Electricity-ML-Artifacts/raw/refs/heads/main/label_encoder_structure_type.pkl"
SCALER_URL = "https://github.com/jainkavya738/Electricity-ML-Artifacts/raw/refs/heads/main/scaler.pkl"
MODEL_URL = "https://github.com/jainkavya738/Electricity-ML-Artifacts/raw/refs/heads/main/model.pkl"


def download_and_load_pkl(url, filename):
    """Download a pickled artifact from *url* and load it with joblib.

    The payload is streamed to a temporary file under ``/tmp`` (named
    *filename*), loaded, and the temporary file is always removed in the
    ``finally`` block — on success and on failure alike.

    Returns the unpickled object, or ``None`` if the download or the
    deserialization fails (callers treat ``None`` as "artifact missing").
    """
    print(f"Attempting to download {filename} from {url}")
    temp_filepath = None
    try:
        # Timeout so a stalled connection cannot hang module import forever.
        response = requests.get(url, stream=True, timeout=30)
        response.raise_for_status()
        temp_filepath = os.path.join('/tmp', filename)
        with open(temp_filepath, 'wb') as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)
        obj = joblib.load(temp_filepath)
        print(f"Successfully downloaded and loaded {filename}")
        return obj
    except requests.exceptions.RequestException as e:
        # Use *filename* here: temp_filepath may still be None if the
        # request itself failed, which previously printed "None".
        print(f"Error downloading {filename} from {url}: {e}")
        return None
    except Exception as e:
        print(f"Error loading {filename} after download: {e}")
        return None
    finally:
        # Single point of cleanup (the old extra os.remove inside the try
        # block was redundant with this).
        if temp_filepath and os.path.exists(temp_filepath):
            try:
                os.remove(temp_filepath)
                print(f"Cleaned up temporary file: {temp_filepath}")
            except OSError as e:
                print(f"Warning: Could not remove temporary file {temp_filepath}: {e}")


# Load all artifacts at import time; any of these may be None if the
# download failed, and preprocess_input() skips the corresponding step.
numerical_imputer = download_and_load_pkl(NUM_IMPUTER_URL, "numerical_imputer.pkl")
categorical_imputer = download_and_load_pkl(CAT_IMPUTER_URL, "categorical_imputer.pkl")
le_structure_type = download_and_load_pkl(LE_STRUCTURE_TYPE_URL, "label_encoder_structure_type.pkl")
scaler = download_and_load_pkl(SCALER_URL, "scaler.pkl")
model = download_and_load_pkl(MODEL_URL, "model.pkl")

# NOTE(review): 'air_qality_index' and 'issue_reolution_time' look like
# typos, but they are presumably the exact column names the artifacts were
# fitted on — do NOT rename without re-fitting imputers/scaler/model.
NUMERICAL_FEATURES = [
    'site_area', 'water_consumption', 'recycling_rate', 'utilisation_rate',
    'air_qality_index', 'issue_reolution_time', 'resident_count'
]
CATEGORICAL_FEATURES = ['structure_type']

# Exact column order the trained model expects at predict time.
FINAL_MODEL_EXPECTED_FEATURES = [
    'site_area', 'water_consumption', 'recycling_rate', 'utilisation_rate',
    'air_qality_index', 'issue_reolution_time', 'resident_count',
    'structure_type'
]


def preprocess_input(input_data: dict) -> pd.DataFrame:
    """Transform a raw input dict into a model-ready single-row DataFrame.

    Steps (each skipped if its artifact failed to load): lower-case/strip
    'structure_type', impute numerical and categorical features, label-encode
    'structure_type', scale numerical features, then reorder columns to
    FINAL_MODEL_EXPECTED_FEATURES (zero-filling any that are missing).

    Raises:
        ValueError: if required input columns are missing, or the
            'structure_type' value was never seen by the label encoder.
        RuntimeError: if any fitted transformer fails to apply.
    """
    df_processed = pd.DataFrame([input_data])
    print(f"DataFrame after initial creation (df_processed)-> \n{df_processed}")

    # Normalize the category text so it matches what the encoder was fit on.
    if 'structure_type' in df_processed.columns:
        df_processed['structure_type'] = (
            df_processed['structure_type'].astype(str).str.lower().str.strip()
        )
        print(f"'structure_type' standardized to: '{df_processed['structure_type'].iloc[0]}'")

    if numerical_imputer is not None and NUMERICAL_FEATURES:
        missing_input = [col for col in NUMERICAL_FEATURES if col not in df_processed.columns]
        if missing_input:
            raise ValueError(f"Error : Numerical features {missing_input} are missing from input DataFrame!")
        try:
            df_processed[NUMERICAL_FEATURES] = numerical_imputer.transform(df_processed[NUMERICAL_FEATURES])
        except Exception as e:
            # raise stops the pipeline immediately at the failing step
            raise RuntimeError(
                f"Error during numerical imputation\n"
                f"Error : {e}"
            )

    if categorical_imputer is not None and CATEGORICAL_FEATURES:
        missing_input = [col for col in CATEGORICAL_FEATURES if col not in df_processed.columns]
        if missing_input:
            raise ValueError(f"Error : Categorical features {missing_input} are missing from input DataFrame!")
        try:
            df_processed[CATEGORICAL_FEATURES] = categorical_imputer.transform(df_processed[CATEGORICAL_FEATURES])
        except Exception as e:
            raise RuntimeError(
                f"Error during categorical imputation\n"
                f"Error : {e}"
            )

    if le_structure_type is not None and 'structure_type' in df_processed.columns:
        try:
            df_processed['structure_type'] = le_structure_type.transform(df_processed['structure_type'])
        except ValueError as e:
            # LabelEncoder raises ValueError for categories unseen at fit time.
            raise ValueError(
                f"Unknown category -> '{df_processed['structure_type'].iloc[0]}' in column 'structure_type'\n"
                f"Error : {e}"
            )
        except Exception as e:
            raise RuntimeError(f"Error during Label Encoding for 'structure_type'...Error: {e}")

    if scaler is not None and NUMERICAL_FEATURES:
        missing_input = [col for col in NUMERICAL_FEATURES if col not in df_processed.columns]
        if missing_input:
            raise ValueError(f"Error : Numerical features {missing_input} are missing from input DataFrame")
        try:
            df_processed[NUMERICAL_FEATURES] = scaler.transform(df_processed[NUMERICAL_FEATURES])
        except Exception as e:
            raise RuntimeError(
                f"Error during scaling\n"
                f"Error: {e}"
            )

    print(f"Current df_processed columns before final reorder: {df_processed.columns.tolist()}")  # Checkpoint

    # Zero-fill any expected column the caller omitted, then fix the order.
    for col in FINAL_MODEL_EXPECTED_FEATURES:
        if col not in df_processed.columns:
            print(f"Adding missing column: '{col}' with value 0.")
            df_processed[col] = 0

    df_final = df_processed[FINAL_MODEL_EXPECTED_FEATURES]
    print(f"Final DataFrame for prediction: \n{df_final}")
    return df_final

# The function above bundles all post-training preprocessing (numerical and
# categorical imputation, label encoding, scaling). Error messages and
# checkpoints are intentionally verbose to surface failures early.
# Next step: deployment (FastAPI service, AWS, etc.).