Spaces:
Sleeping
Sleeping
File size: 4,897 Bytes
"""
Feature classification module for detecting data types in DataFrames.

This module provides the DetectFeatureClasses class, which automatically
classifies features as Binary, Categorical, or Continuous based on their
statistical properties.
"""
import numpy as np
class DetectFeatureClasses:
    """
    Classify every column of a pandas DataFrame as a feature class.

    Each column is labelled 'Binary', 'Categorical' or 'Continuous' from the
    number of distinct values it holds and from whether it can be converted
    to float.

    Parameters
    ----------
    dataframe : pd.DataFrame
        The input DataFrame containing the features to be classified.
    categorical_threshold : float, optional
        Relative tolerance used when comparing a column's unique-value count
        with the number of rows. A column is categorical when the two counts
        differ by more than ``categorical_threshold * n_rows``.
        Default is 0.5.
    string_data_policy : str, optional
        What to do with a non-binary, non-categorical feature that cannot be
        converted to float. 'drop' reports the feature in the list of dropped
        features; any other value (documented option: 'ignore') leaves the
        feature out of both results. Default is 'drop'.

    Methods
    -------
    feature_classes() -> (dict, list)
        Returns a mapping of feature name to class and the list of features
        dropped because of the string data policy.
    """

    def __init__(self, dataframe, categorical_threshold=0.5, string_data_policy='drop'):
        """
        Store the DataFrame and the classification parameters.

        Parameters
        ----------
        dataframe : pd.DataFrame
            The input DataFrame containing the features to be classified.
        categorical_threshold : float, optional
            Relative tolerance for the unique-count versus row-count
            comparison. Default is 0.5.
        string_data_policy : str, optional
            'drop' or 'ignore'; see the class docstring. Default is 'drop'.
        """
        self.dataframe = dataframe
        self.categorical_threshold = categorical_threshold
        self.string_data_policy = string_data_policy

    def _binaries(self):
        """
        Identify binary features in the DataFrame.

        A feature is considered binary if it has at most 2 unique values
        (missing values count as a value, because ``Series.unique`` keeps
        them).

        Returns
        -------
        list
            Column names classified as binary, in DataFrame column order.
        """
        return [
            column
            for column in self.dataframe.columns
            if len(self.dataframe[column].unique()) <= 2
        ]

    def _categorical(self):
        """
        Identify categorical features using the categorical threshold.

        A feature is considered categorical when its number of unique values
        is NOT close to the total number of rows, where "close" is
        ``np.isclose`` with ``rtol=self.categorical_threshold`` and the row
        count as the reference value.

        Returns
        -------
        list
            Column names classified as categorical, in DataFrame column
            order. Binary columns may also appear here; ``feature_classes``
            gives the binary label priority.
        """
        n_rows = len(self.dataframe)
        categorical_columns = []
        for column in self.dataframe.columns:
            n_unique = len(self.dataframe[column].unique())
            # np.isclose returns a NumPy boolean, which is never the Python
            # `False` singleton, so the result must be negated with `not`
            # instead of being compared with `is False`.
            if not np.isclose(n_unique, n_rows, rtol=self.categorical_threshold):
                categorical_columns.append(column)
        return categorical_columns

    def feature_classes(self):
        """
        Classify the DataFrame's features as 'Binary', 'Categorical' or 'Continuous'.

        Binary takes priority over categorical. Every remaining feature is
        converted to float; on success the converted column is written back
        into ``self.dataframe`` (the stored DataFrame is modified) and the
        feature is labelled 'Continuous'.

        Returns
        -------
        dict
            Feature names mapped to 'Binary', 'Categorical' or 'Continuous'.
        list
            Features that could not be converted to float and were dropped
            because ``string_data_policy`` is 'drop'. With any other policy
            such features appear in neither return value.
        """
        # Sets make the per-feature membership tests O(1).
        binary_columns = set(self._binaries())
        categorical_columns = set(self._categorical())

        features_class_types = {}
        excess_columns = []

        for feature in self.dataframe.columns:
            if feature in binary_columns:
                features_class_types[feature] = 'Binary'
                continue
            if feature in categorical_columns:
                features_class_types[feature] = 'Categorical'
                continue

            # Neither binary nor categorical: continuous if it converts to float.
            try:
                converted = self.dataframe[feature].astype(float)
            except (ValueError, TypeError):
                # Non-numeric strings raise ValueError; dtypes such as
                # datetimes raise TypeError. Both mean "not convertible".
                if self.string_data_policy == 'drop':
                    excess_columns.append(feature)
                # Any other policy ('ignore'): leave the feature unclassified.
                continue

            self.dataframe[feature] = converted
            features_class_types[feature] = 'Continuous'

        return features_class_types, excess_columns
|