""" Feature classification module for detecting data types in DataFrames. This module provides the DetectFeatureClasses class which automatically classifies features as Binary, Categorical, or Continuous based on their statistical properties. """ import numpy as np class DetectFeatureClasses: """ A class to detect feature classes in a pandas DataFrame. Parameters: ---------- dataframe : pd.DataFrame The input DataFrame containing features to be classified. categorical_threshold : float, optional The relative threshold to determine if a feature is categorical based on the ratio of unique values to total rows. Default is 0.5. string_data_policy : str, optional Policy for handling string data that cannot be converted to float. Options are 'drop' to drop such features or 'ignore' to leave them as is. Default is 'drop'. Methods: ------- feature_classes() -> dict Classifies features into 'Binary', 'Categorical', or 'Continuous' and returns a dictionary with feature names as keys and their classes as values. """ def __init__(self, dataframe, categorical_threshold=0.5, string_data_policy='drop'): """ Initializes the DetectFeatureClasses with the provided DataFrame and parameters. Parameters: ---------- dataframe : pd.DataFrame The input DataFrame containing features to be classified. categorical_threshold : float, optional The relative threshold to determine if a feature is categorical based on the ratio of unique values to total rows. Default is 0.5. string_data_policy : str, optional Policy for handling string data that cannot be converted to float. Options are 'drop' to drop such features or 'ignore' to leave them as is. Default is 'drop'. """ self.dataframe = dataframe self.categorical_threshold = categorical_threshold self.string_data_policy = string_data_policy def _binaries(self): """ Identifies binary features in the DataFrame. A feature is considered binary if it has at most 2 unique values. Returns ------- list A list of column names that are classified as binary features. """ binary_columns = [column for column in self.dataframe.columns if len(self.dataframe[column].unique()) <= 2] return binary_columns def _categorical(self): """ Identifies categorical features in the DataFrame based on the categorical threshold. A feature is considered categorical if the number of unique values is significantly less than the total number of rows (using the categorical_threshold as a relative tolerance). Returns ------- list A list of column names that are classified as categorical features. """ categorical_columns = [] for column in self.dataframe.columns: # Check if unique count is not close to total rows (within threshold) if np.isclose(len( self.dataframe[column].unique()), len(self.dataframe), rtol=self.categorical_threshold ) is False: categorical_columns.append(column) return categorical_columns def feature_classes(self): """ Classifies features in the DataFrame into 'Binary', 'Categorical', or 'Continuous'. Returns: ------- dict A dictionary with feature names as keys and their classes ('Binary', 'Categorical', 'Continuous') as values. list A list of features that were dropped due to string data policy. """ binary_columns = self._binaries() categorical_columns = self._categorical() features_class_types = {} excess_columns = [] # Classify each feature for feature in self.dataframe.columns: if feature in binary_columns: features_class_types[feature] = 'Binary' elif feature in categorical_columns: features_class_types[feature] = 'Categorical' else: # Try to convert to float to determine if continuous try: self.dataframe[feature] = self.dataframe[feature].astype(float) features_class_types[feature] = 'Continuous' except ValueError: # Cannot convert to float - handle based on policy if self.string_data_policy == 'drop': excess_columns.append(feature) else: # 'ignore' policy: leave as-is (not recommended) pass return features_class_types, excess_columns