Spaces:
Build error
Build error
| import pandas as pd | |
| import numpy as np | |
| import pickle | |
# Name of the pickle file that contains the pre-trained data preprocessing pipeline.
# The path is relative, so it is resolved against the current working directory.
pipeline_pkl = "full_pipeline.pkl"
def load_pickle(filename):
    """Load and return the object stored in a pickle file.

    Args:
        filename: Path of the pickle file to read.

    Returns:
        The object deserialized from the file.

    Raises:
        OSError: If the file cannot be opened.
        pickle.UnpicklingError: If the contents are not a valid pickle
            (other exceptions are possible, depending on the payload).
    """
    # SECURITY: unpickling can execute arbitrary code. Only call this on
    # files that come from a trusted source.
    with open(filename, "rb") as file:
        return pickle.load(file)
# Load the pre-processing pipeline from the pickle file.
# This runs at import time, so the pickle file must be readable when the module is imported.
preprocessor = load_pickle(pipeline_pkl)
def create_new_columns(train_data):
    """Add the engineered columns to ``train_data`` and drop ``tenure``.

    The DataFrame is modified in place; the same object is also returned so
    the call can be used in an assignment.

    Args:
        train_data: DataFrame containing the 'TotalCharges', 'tenure' and
            'MonthlyCharges' columns.

    Returns:
        ``train_data`` itself, with 'Monthly Variations' and 'tenure_group'
        added and 'tenure' removed.
    """
    # 'Monthly Variations' = TotalCharges - tenure * MonthlyCharges.
    train_data['Monthly Variations'] = (
        train_data['TotalCharges']
        - train_data['tenure'] * train_data['MonthlyCharges']
    )

    # Bin edges 0, 3, ..., 75 define half-open bins [0, 3), [3, 6), ..., [72, 75).
    # The labels are derived from the same edges so the two cannot drift apart.
    bin_edges = list(range(0, 78, 3))
    labels = ['{0}-{1}'.format(start, start + 2) for start in bin_edges[:-1]]

    # right=False keeps the left edge inclusive; tenure values outside
    # [0, 75) fall into no bin and become NaN.
    train_data['tenure_group'] = pd.cut(
        train_data['tenure'], bins=bin_edges, right=False, labels=labels
    )

    # The raw 'tenure' column is replaced by its binned form.
    train_data.drop(columns=['tenure'], inplace=True)
    return train_data
def create_processed_dataframe(processed_data, train_data, fitted_preprocessor=None):
    """Wrap the preprocessor's output matrix in a DataFrame with named columns.

    Column names are the non-object, non-category columns of ``train_data``
    followed by the feature names reported by the 'cat_encoder' step of the
    preprocessor's 'categorical' transformer.

    Args:
        processed_data: Transformed feature matrix. Either a sparse matrix
            (anything exposing ``toarray()``) or a dense 2-D array-like.
        train_data: DataFrame whose column names and dtypes supply the
            numerical labels. Presumably the frame that was passed to the
            preprocessor - verify against the caller.
        fitted_preprocessor: Optional fitted preprocessor to read the
            categorical feature names from. Defaults to the module-level
            ``preprocessor``.

    Returns:
        A DataFrame holding ``processed_data`` with the combined labels as
        column names.

    Raises:
        ValueError: Raised by pandas when the number of labels does not
            match the number of columns in ``processed_data``.
    """
    if fitted_preprocessor is None:
        fitted_preprocessor = preprocessor

    # Numerical columns are everything that is not object/category typed.
    # NOTE(review): this assumes the numerical features come first in the
    # preprocessor's output - confirm against the pipeline definition.
    train_num_cols = train_data.select_dtypes(exclude=['object', 'category']).columns

    encoder = fitted_preprocessor.named_transformers_['categorical']['cat_encoder']
    # get_feature_names() was removed in scikit-learn 1.2. Keep using it
    # where it exists so the column names stay exactly as before, and fall
    # back to get_feature_names_out() on newer versions.
    names_getter = getattr(encoder, 'get_feature_names', None)
    if names_getter is None:
        names_getter = encoder.get_feature_names_out
    cat_features = names_getter()

    labels = np.concatenate([train_num_cols, cat_features])

    # The transformer may return either a sparse matrix or a dense array;
    # only sparse results need densifying.
    if hasattr(processed_data, 'toarray'):
        values = processed_data.toarray()
    else:
        values = processed_data
    return pd.DataFrame(values, columns=labels)