# -*- coding: utf-8 -*-
"""
Created on Mon Jul 12 09:45:10 2021
@author: Kishore
"""
######### Importing modules ############
import numpy as np
from sklearn.preprocessing import LabelEncoder
| limit_number_of_class=10 | |
########### Returns cleaned data ###################
| def data_clean(dataset,cols): | |
| print("data cleaning started") | |
| # dataset=data#pd.read_csv(dict['path'],header=0) | |
| # columns=cols | |
| clean_dict = {} | |
| # dropping unwanted columns | |
| dataset.drop(cols, axis=1, inplace=True) | |
| #### auto dropping ID coulmns #### | |
| auto_drop=[] | |
| for col in dataset.columns: | |
| if col not in cols: | |
| if len(dataset[col]) == dataset[col].nunique(): | |
| dataset.drop(col, axis=1, inplace=True) | |
| auto_drop.append(col) | |
| cols.append(col) | |
| #################################### | |
| clean_dict['dropped_features'] = cols | |
| clean_dict['auto_drop']=auto_drop | |
| # fname="document.txt" | |
| # f = open(fname, "a") | |
| # f.write("Documentation\n") | |
| # f.write("\n################ Data Cleaning steps ###################\n") | |
| # f.write("\n Dropped columns: "+str(dict['dropcols'])+"\n") | |
| # print(dict['dropcols']) | |
| # dropping duplicates | |
| # duplicate = dataset[dataset.duplicated()] | |
| # duplicate_rows = len(duplicate) | |
| # Dropping Duplicates | |
| dataset = dataset.drop_duplicates() | |
| # if duplicate_rows>0: | |
| # line="\n Dropped ", str(duplicate_rows) ," duplicate rows\n" | |
| # f.write(line) | |
| ############# Handling Missing values ###################################### | |
| num_data_col = dataset.select_dtypes(include=np.number).columns.tolist() | |
| cat_data_col = dataset.select_dtypes(include=['object']).columns.tolist() | |
| missing_data={} | |
| for col in num_data_col: | |
| dataset[col].fillna(dataset[col].mean(),inplace=True) | |
| missing_data[col] = dataset[col].mean() | |
| #filling missing values for categorical data | |
| for col in cat_data_col: | |
| dataset[col].fillna(dataset[col].mode()[0],inplace=True) | |
| missing_data[col] = dataset[col].mode()[0] | |
| clean_dict['missing_values']=missing_data | |
| ############################################################################### | |
| # f.write("\n Handled missing values , filled mean and mode value for numeric and categorical variable respectively \n") | |
| ############### Converting categorical to numeric values ################## | |
| labelencoder = LabelEncoder() | |
| # f.writelines("\n ......Categorical to numeric data information..... \n") | |
| # Assigning numerical values and storing in another column | |
| lst_cn=[] | |
| for col in cat_data_col: | |
| new_col = col + '_encoded' | |
| dataset[new_col] = labelencoder.fit_transform(dataset[col]) | |
| d1 = dataset.drop_duplicates(col).set_index(col) | |
| dataset[col] = dataset[new_col] | |
| dataset.drop([new_col], axis=1, inplace=True) | |
| d1.drop(d1.columns.difference([col, new_col]), 1, inplace=True) | |
| dict_map = d1.to_dict() | |
| lst_cn.append(dict_map) | |
| # print(dict_map) | |
| clean_dict['categorical_to_numeric']=lst_cn | |
| ################################################################## | |
| # print(clean_dict) | |
| # f.write("\n" + str(dict_map)) | |
| # | |
| # f.write("\n#####################################################\n") | |
| # f.close() | |
| # dict['doc_path']=fname | |
| return dataset,clean_dict | |
# Example usage (kept from original, disabled):
# print(data.head())
# dict_param={'dropcols': ['Loan_ID', 'Gender', 'Education'], 'path':"../dataset/Loan_Approval_prediction/train.csv" }
# df=data_clean(dict_param)
# print(df.head())