Spaces:
Build error
Build error
| import pandas as pd | |
| import numpy as np | |
def data_imp():
    """Build the feature descriptions and default inputs for the three datasets.

    Fresh dictionaries are created on every call, so callers may mutate the
    returned objects freely.

    Returns:
        A 6-tuple, in this order: insurance descriptions, banking
        descriptions, retail descriptions, insurance defaults, banking
        defaults, retail defaults. Description dicts map a column name to a
        human-readable explanation; default dicts map a column name to a
        numeric default value.
    """
    # --- Insurance dataset -------------------------------------------------
    insurance_docs = {
        "CustID": "Unique identifier for each customer.",
        "FirstPolYear": "Year when the customer first bought an insurance policy.",
        "BirthYear": "Birth year of the customer, used to calculate age.",
        "EducDeg": "Highest educational degree obtained by the customer.",
        "MonthSal": "Monthly salary of the customer. (Numerical, float64)",
        "GeoLivArea": "Geographical area where the customer lives.",
        "Children": "Number of children the customer has.",
        "CustMonVal": "Total monetary value of the customer to the company.",
        "ClaimsRate": "Rate at which the customer files insurance claims.",
        "PremMotor": "Premium amount for motor insurance.",
        "PremHousehold": "Premium amount for household insurance.",
        "PremHealth": "Premium amount for health insurance.",
        "PremLife": "Premium amount for life insurance.",
        "PremWork": "Premium amount for work insurance.",
    }
    insurance_initial = dict(
        FirstPolYear=1999,
        BirthYear=1980,
        MonthSal=1000,
        GeoLivArea=0,  # Options: 0, 1, 2, 3
        Children=0,  # Options: 0, 1, 2
        CustMonVal=100,
        ClaimsRate=2.33,
        PremMotor=200,
        PremHousehold=200,
        PremHealth=200,
        PremLife=200,
        PremWork=200,
    )

    # --- Banking dataset ---------------------------------------------------
    banking_docs = {
        "CUST_ID": "Unique identifier for each customer.",
        "BALANCE": "The average balance left in the customer's account.",
        "BALANCE_FREQUENCY": "Frequency with which the balance is updated.",
        "PURCHASES": "The total amount of purchases made by the customer.",
        "ONEOFF_PURCHASES": "The total amount of one-time purchases made by the customer.",
        "INSTALLMENTS_PURCHASES": "The total amount of purchases made in installments.",
        "CASH_ADVANCE": "The total amount of cash advances taken by the customer.",
        "PURCHASES_FREQUENCY": "The frequency of purchases made by the customer.",
        "ONEOFF_PURCHASES_FREQUENCY": "The frequency of one-time purchases made by the customer.",
        "PURCHASES_INSTALLMENTS_FREQUENCY": "The frequency of purchases made in installments.",
        "CASH_ADVANCE_FREQUENCY": "The frequency of cash advances taken by the customer.",
        "CASH_ADVANCE_TRX": "The number of cash advance transactions made by the customer.",
        "PURCHASES_TRX": "The number of purchase transactions made by the customer.",
        "CREDIT_LIMIT": "The credit limit assigned to the customer's account.",
        "PAYMENTS": "The total amount of payments made by the customer.",
        "MINIMUM_PAYMENTS": "The minimum amount of payments made by the customer.",
        "PRC_FULL_PAYMENT": "The percentage of full payments made by the customer.",
        "TENURE": "The tenure of the customer in months.",
    }
    banking_initial = dict(
        BALANCE=2000,
        BALANCE_FREQUENCY=0.5,
        PURCHASES=500,
        ONEOFF_PURCHASES=0,
        INSTALLMENTS_PURCHASES=0,
        CASH_ADVANCE=200,
        PURCHASES_FREQUENCY=0.1,
        ONEOFF_PURCHASES_FREQUENCY=0.1,
        PURCHASES_INSTALLMENTS_FREQUENCY=0.5,
        CASH_ADVANCE_FREQUENCY=5,
        CASH_ADVANCE_TRX=5,
        PURCHASES_TRX=5,
        CREDIT_LIMIT=10000,
        PAYMENTS=500,
        MINIMUM_PAYMENTS=130,
        PRC_FULL_PAYMENT=0.22,
        TENURE=10,
    )

    # --- Retail dataset ----------------------------------------------------
    retail_docs = {
        "Channel": "Indicates the sales channel through which the customer made purchases.",
        "Region": "The geographical region where the customer is located.",
        "Fresh": "Annual spending (in monetary units) on fresh products.",
        "Milk": "Annual spending (in monetary units) on milk products.",
        "Grocery": "Annual spending (in monetary units) on grocery items.",
        "Frozen": "Annual spending (in monetary units) on frozen products.",
        "Detergents_Paper": "Annual spending (in monetary units) on detergents and paper products.",
        "Delicassen": "Annual spending (in monetary units) on delicatessen products.",
    }
    retail_initial = dict(
        Fresh=6000,
        Milk=9000,
        Grocery=9000,
        Frozen=4000,
        Detergents_Paper=4000,
        Delicassen=2000,
    )

    # Descriptions first (insurance, banking, retail), then defaults in the
    # same dataset order.
    return (
        insurance_docs,
        banking_docs,
        retail_docs,
        insurance_initial,
        banking_initial,
        retail_initial,
    )
def preprocess_data(data):
    """Strip customer-identifier columns and remove outlier rows.

    Args:
        data: DataFrame holding one of the customer datasets.

    Returns:
        A new DataFrame without any identifier column and without the rows
        that ``remove_outliers`` rejects. The input frame is not modified.
    """
    # The insurance data names its identifier "CustID", while the banking
    # feature descriptions name it "CUST_ID"; "Cust_ID" is kept for backward
    # compatibility. errors="ignore" skips whichever spellings are absent.
    data = data.drop(columns=["CustID", "Cust_ID", "CUST_ID"], errors="ignore")
    return remove_outliers(data)
def remove_outliers(df, threshold=3):
    """Drop rows containing a numeric value too far from its column mean.

    A row is kept only when, for every checked numeric column, the absolute
    z-score ``|value - mean| / std`` is strictly below ``threshold``. The
    standard deviation is pandas' default sample standard deviation.

    Columns whose standard deviation is zero or undefined (constant columns,
    or a frame with a single row) cannot contain outliers and are skipped.
    Without this, their z-scores are NaN, every comparison is False, and the
    whole dataset would be discarded.

    Rows with a missing value in a checked column are still dropped, because
    a NaN z-score never compares below the threshold.

    Args:
        df: Input DataFrame. Non-numeric columns are ignored by the check but
            preserved in the result.
        threshold: Maximum allowed absolute z-score (exclusive).

    Returns:
        A new DataFrame holding the rows that passed the check, with the same
        columns as ``df``.
    """
    numeric = df.select_dtypes(include=[float, int])
    spread = numeric.std()

    # Keep only columns with a strictly positive, defined spread; NaN > 0 is
    # False, so undefined deviations are excluded as well.
    checked = spread[spread > 0].index
    if checked.empty:
        return df.copy()

    values = numeric[checked]
    z_scores = ((values - values.mean()) / spread[checked]).abs()
    keep = (z_scores < threshold).all(axis=1)
    return df[keep]