Spaces:
Sleeping
Sleeping
| """ | |
| Feature classification module for detecting data types in DataFrames. | |
| This module provides the DetectFeatureClasses class which automatically | |
| classifies features as Binary, Categorical, or Continuous based on their | |
| statistical properties. | |
| """ | |
| import numpy as np | |
class DetectFeatureClasses:
    """
    Detect the class of each feature (column) in a pandas DataFrame.

    Every column is labelled 'Binary', 'Categorical' or 'Continuous' from
    the number of distinct values it holds relative to the number of rows.

    Parameters
    ----------
    dataframe : pd.DataFrame
        The input DataFrame containing the features to be classified.
        NOTE: ``feature_classes`` converts columns classified as
        'Continuous' to float *in place* on this object.
    categorical_threshold : float, optional
        Relative tolerance passed to ``np.isclose`` when comparing a
        column's unique-value count with the row count. A column whose
        unique count is NOT within this tolerance of the row count is
        treated as categorical. Default is 0.5.
    string_data_policy : str, optional
        What to do with a column that is neither binary nor categorical and
        cannot be converted to float. 'drop' reports the column in the list
        of dropped features; any other value (e.g. 'ignore') silently leaves
        it out of both results. Default is 'drop'.

    Methods
    -------
    feature_classes() -> (dict, list)
        Returns a mapping of feature name to class, plus the list of
        features dropped because of the string data policy.
    """

    def __init__(self, dataframe, categorical_threshold=0.5, string_data_policy='drop'):
        """
        Store the DataFrame and the classification settings.

        Parameters
        ----------
        dataframe : pd.DataFrame
            The input DataFrame containing the features to be classified.
        categorical_threshold : float, optional
            Relative tolerance used to decide whether a column's unique
            count is "close" to the row count. Default is 0.5.
        string_data_policy : str, optional
            'drop' to report non-convertible columns as dropped, anything
            else to skip them silently. Default is 'drop'.
        """
        self.dataframe = dataframe
        self.categorical_threshold = categorical_threshold
        self.string_data_policy = string_data_policy

    def _unique_count(self, column):
        """Return the number of distinct values reported by ``Series.unique()``."""
        return len(self.dataframe[column].unique())

    def _binaries(self):
        """
        Identify binary features in the DataFrame.

        A feature is considered binary if it has at most 2 unique values.

        Returns
        -------
        list
            Column names classified as binary, in DataFrame column order.
        """
        return [
            column
            for column in self.dataframe.columns
            if self._unique_count(column) <= 2
        ]

    def _categorical(self):
        """
        Identify categorical features using ``categorical_threshold``.

        A feature is considered categorical when its unique-value count is
        not close to the total number of rows, with ``categorical_threshold``
        as the relative tolerance of ``np.isclose``.

        Returns
        -------
        list
            Column names classified as categorical, in DataFrame column
            order. Binary columns may appear here too; ``feature_classes``
            gives 'Binary' precedence.
        """
        total_rows = len(self.dataframe)
        # np.isclose returns a numpy bool, which is never identical to the
        # Python ``False`` singleton, so it must be negated with ``not``
        # rather than compared with ``is False``.
        return [
            column
            for column in self.dataframe.columns
            if not np.isclose(
                self._unique_count(column),
                total_rows,
                rtol=self.categorical_threshold,
            )
        ]

    def feature_classes(self):
        """
        Classify every feature as 'Binary', 'Categorical' or 'Continuous'.

        Binary takes precedence over categorical. Remaining columns are
        converted to float; on success the converted column is written back
        to ``self.dataframe`` and labelled 'Continuous'. Columns that cannot
        be converted are handled according to ``string_data_policy``.

        Returns
        -------
        dict
            Feature names mapped to 'Binary', 'Categorical' or 'Continuous'.
        list
            Features that could not be converted to float and were dropped
            under the 'drop' policy (empty for any other policy).
        """
        # Sets give O(1) membership tests inside the loop below.
        binary_columns = set(self._binaries())
        categorical_columns = set(self._categorical())

        features_class_types = {}
        excess_columns = []

        for feature in self.dataframe.columns:
            if feature in binary_columns:
                features_class_types[feature] = 'Binary'
                continue
            if feature in categorical_columns:
                features_class_types[feature] = 'Categorical'
                continue

            # Neither binary nor categorical: continuous only if numeric.
            try:
                converted = self.dataframe[feature].astype(float)
            except (ValueError, TypeError):
                # Non-numeric data. Unparseable strings raise ValueError;
                # some dtypes refuse the cast with TypeError instead.
                if self.string_data_policy == 'drop':
                    excess_columns.append(feature)
                # Any other policy: leave the column unclassified.
                continue

            self.dataframe[feature] = converted
            features_class_types[feature] = 'Continuous'

        return features_class_types, excess_columns